Ejemplo n.º 1
0
def main():
    """Build an interval index for a MAF alignment file.

    Command line arguments are parsed from the module docstring with
    ``doc_optparse``:

    * ``args[0]`` -- the MAF file. A name ending in ``.bz2`` or ``.lzo`` is
      opened through a seekable reader and requires a companion offset
      table named ``<maf_file>t`` to exist already.
    * ``args[1]`` (optional) -- the index file to write. Defaults to the MAF
      file name, with a ``.bz2`` / ``.lzo`` suffix removed, plus ``.index``.
    * ``options.species`` (optional) -- comma-separated names; when given,
      only components whose ``src`` prefix (the text before the first
      ``.``) is in that list are indexed.

    Every indexed component is recorded with the file offset at which its
    enclosing block starts.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)

    try:
        maf_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if maf_file.endswith(".bz2"):
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File(maf_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with lzop_build_offset_table.")
            # Open with SeekableLzopFile so we have tell support
            maf_in = SeekableLzopFile(maf_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open(maf_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = maf_file + ".index"
        if options.species:
            species = options.species.split(",")
        else:
            species = None
    except Exception:
        # Only genuine errors are reported as command line problems;
        # SystemExit / KeyboardInterrupt are allowed to propagate instead of
        # being swallowed by a bare ``except:``.
        doc_optparse.exception()

    maf_reader = bx.align.maf.Reader(maf_in)

    indexes = interval_index_file.Indexes()

    # Need to be a bit tricky in our iteration here to get the 'tells' right:
    # the offset has to be sampled *before* the reader consumes the block.
    while True:
        pos = maf_reader.file.tell()
        block = maf_reader.next()
        if block is None:
            break
        for c in block.components:
            if species is not None and c.src.split('.')[0] not in species:
                continue
            indexes.add(c.src, c.forward_strand_start, c.forward_strand_end,
                        pos, max=c.src_size)

    # The context manager closes the index file even if writing fails.
    with open(index_file, 'w') as out:
        indexes.write(out)
Ejemplo n.º 2
0
def main():
    """Build an interval index for a wiggle file.

    Command line arguments are parsed from the module docstring with
    ``doc_optparse``:

    * ``args[0]`` -- the wiggle file. A name ending in ``.bz2`` or ``.lzo``
      is opened through a seekable reader and requires a companion offset
      table named ``<wiggle_file>t`` to exist already.
    * ``args[1]`` (optional) -- the index file to write. Defaults to the
      wiggle file name, with a ``.bz2`` / ``.lzo`` suffix removed, plus
      ``.index``.

    Returns immediately when ``options.version`` is set.

    Lines seen before any ``variableStep`` / ``fixedStep`` declaration are
    treated as bed-style records (``chrom start end ...``) and indexed one
    per line. Each ``variableStep`` / ``fixedStep`` section is indexed as a
    single interval located at the file offset of its declaration line.

    Raises:
        ValueError: if a bed-style data line has fewer than three fields.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    if options.version:
        return

    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith(".bz2"):
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File(wiggle_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with lzop_build_offset_table.")
            # Open with SeekableLzopFile so we have tell support
            wiggle_in = SeekableLzopFile(wiggle_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open(wiggle_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except Exception:
        # Only genuine errors are reported as command line problems;
        # SystemExit / KeyboardInterrupt are allowed to propagate instead of
        # being swallowed by a bare ``except:``.
        doc_optparse.exception()

    indexes = interval_index_file.Indexes()

    # Can't use the iterator, as there is no next() and thus
    # no way to access the positions. The following code is
    # modified from wiggle.py
    last_chrom = None
    start = None
    end = None
    first_pos = None

    # Until a variableStep/fixedStep declaration is seen, data lines are
    # interpreted as bed-style records.
    mode = "bed"

    while True:
        # Sample the offset before reading so it points at the line start.
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line:
            break

        if line.isspace() or line.startswith(("track", "#", "browser")):
            continue
        elif line.startswith(("variableStep", "fixedStep")):
            # A new section begins: flush the one collected so far.
            if first_pos is not None:
                indexes.add(last_chrom, start, end, first_pos)
            first_pos = pos
            header = bx.wiggle.parse_header(line)
            last_chrom = header['chrom']
            start = int(header['start']) - 1
            end = start
            current_step = None
            if 'span' in header:
                current_span = int(header['span'])
            else:
                current_span = 1
            if 'step' in header:
                current_step = int(header['step'])

            if line.startswith("variableStep"):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "bed":
            # The declaration check above must come first, otherwise the
            # very first declaration line would be parsed as a bed record.
            fields = line.split()
            if len(fields) < 3:
                raise ValueError("Unexpected input line: %s" % line.strip())
            indexes.add(fields[0], int(fields[1]), int(fields[2]), pos)
        elif mode == "variableStep":
            fields = line.split()
            end = int(fields[0]) - 1 + current_span
        elif mode == "fixedStep":
            end += current_step

    # The last variableStep/fixedStep section has no following declaration
    # to trigger its flush, so add it once the input is exhausted.
    if first_pos is not None:
        indexes.add(last_chrom, start, end, first_pos)

    # The context manager closes the index file even if writing fails.
    with open(index_file, 'w') as out:
        indexes.write(out)
import argparse
import marshal

from bx.misc.seekbzip2 import SeekableBzip2File


# Command line interface: three positional paths plus one optional flag.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('bz2', help='bz2 dump file')
parser.add_argument('bz2t', help='bzip-table file')
parser.add_argument('output', help='destination file')
parser.add_argument('--offsets-only', action='store_true')
args = parser.parse_args()


# Maps each page title to the offset recorded for its '  <page>' line.
index = {}
dump = SeekableBzip2File(args.bz2, args.bz2t)
# Position reported by dump.tell() after the previously read line, i.e.
# presumably the offset at which the line being processed starts.
offset = 0
try:
    for line in dump:
        if line == '  <page>\n':
            # Remember where this page element begins.
            start = offset
        elif line.startswith('    <title>'):
            # Drop the 11-character '    <title>' prefix and the last 9
            # characters (assumes the line ends with '</title>\n').
            title = line[11:-9]
            index[title] = start
        offset = dump.tell()
finally:
    # Close the dump whether or not the scan succeeded.
    dump.close()

if args.offsets_only:
    # NOTE(review): iterating a dict yields its keys, so this keeps the
    # titles rather than the offsets -- confirm this matches the intent of
    # --offsets-only.
    index = tuple(index)
Ejemplo n.º 4
0
"""
import atexit
import marshal
import ujson

from bottle import request, response, route, run
from bx.misc.seekbzip2 import SeekableBzip2File
from xml.sax.saxutils import unescape

# Paths of the compressed dump, its bzip offset table and the marshalled
# title index.
config = {
    'filename': '/run/data/en-2012-09-10.xml.bz2',
    'table_filename': '/run/data/en-2012-09-10.xml.bz2t',
    'index': '/run/data/en-2012-09-10.xml.marshal'
}

# NOTE(review): every key of config, including 'index', is forwarded as a
# keyword argument -- confirm SeekableBzip2File accepts the extra keyword.
dump = SeekableBzip2File(**config)
# Close the dump when the interpreter exits.
atexit.register(dump.close)

# Load the index once at import time; get_page() looks titles up in it.
with open(config['index'], 'rb') as f:
    index = marshal.load(f)

# HTTP response headers: JSON payload, publicly cacheable for 31536000
# seconds (365 days). Not used within the visible part of this file.
http_headers = {
    'Cache-Control': 'public, max-age=31536000',
    'Content-Type': 'application/json; charset=utf-8'
}


def get_page(title):
    offset = index.get(title)
    if offset is None:
        return {'-1': {'ns': 0, 'title': title, 'missing': ''}}