def main():
    """Build an interval index for the MAF file named on the command line.

    Positional arguments (as returned by ``doc_optparse.parse``):

    * ``args[0]`` -- the MAF file.  A ``.bz2`` or ``.lzo`` file is opened
      through its seekable wrapper and requires a companion offset table
      named ``<file>t`` so that ``tell()`` is available.
    * ``args[1]`` (optional) -- the index file to write.  Defaults to the
      MAF file name, with any ``.bz2``/``.lzo`` suffix removed, plus
      ``.index``.

    If ``options.species`` is set it is a comma separated list; only
    components whose source prefix (the text before the first ``.``) is in
    that list are indexed.

    Every indexed component is recorded at the file offset of the block that
    contains it.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)

    try:
        maf_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if maf_file.endswith(".bz2"):
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File(maf_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with lzop_build_offset_table.")
            # Open with SeekableLzopFile so we have tell support
            maf_in = SeekableLzopFile(maf_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open(maf_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = maf_file + ".index"
        # A set gives constant-time membership tests in the loop below
        if options.species:
            species = set(options.species.split(","))
        else:
            species = None
    except Exception:
        # Deliberately not a bare ``except:`` so that a SystemExit raised by
        # doc_optparse.exit() above is not re-reported as a failure.
        # NOTE(review): doc_optparse.exception() is assumed to report the
        # error and terminate the process -- confirm.
        doc_optparse.exception()

    maf_reader = bx.align.maf.Reader(maf_in)

    indexes = interval_index_file.Indexes()

    # Need to be a bit tricky in our iteration here to get the 'tells' right:
    # the offset has to be sampled *before* each block is read.
    while True:
        pos = maf_reader.file.tell()
        # The builtin works with both ``next`` and ``__next__`` readers
        block = next(maf_reader)
        if block is None:
            break
        for c in block.components:
            if species is not None and c.src.split('.')[0] not in species:
                continue
            indexes.add(c.src, c.forward_strand_start,
                        c.forward_strand_end, pos, max=c.src_size)

    # The index is binary data: write it in binary mode and make sure the
    # handle is closed even if writing fails.
    with open(index_file, 'wb') as out:
        indexes.write(out)
def _add_step_section(indexes, chrom, start, end, pos):
    """Index one variableStep/fixedStep section if its extent is known."""
    if pos is not None and start is not None:
        indexes.add(chrom, start, end, pos)


def main():
    """Build an interval index for the wiggle file named on the command line.

    Positional arguments (as returned by ``doc_optparse.parse``):

    * ``args[0]`` -- the wiggle file.  A ``.bz2`` or ``.lzo`` file is opened
      through its seekable wrapper and requires a companion offset table
      named ``<file>t`` so that ``tell()`` is available.
    * ``args[1]`` (optional) -- the index file to write.  Defaults to the
      wiggle file name, with any ``.bz2``/``.lzo`` suffix removed, plus
      ``.index``.

    Returns immediately when ``options.version`` is set.

    Data lines seen before any step header are treated as bed-style lines
    (``chrom start end ...``) and indexed one by one at their own offset.
    Each ``variableStep``/``fixedStep`` section is indexed as a single
    interval located at the offset of its header line.

    Raises:
        ValueError: if a bed-style data line has fewer than three fields.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    if options.version:
        return

    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith(".bz2"):
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File(wiggle_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with lzop_build_offset_table.")
            # Open with SeekableLzopFile so we have tell support
            wiggle_in = SeekableLzopFile(wiggle_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open(wiggle_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except Exception:
        # Deliberately not a bare ``except:`` so that a SystemExit raised by
        # doc_optparse.exit() above is not re-reported as a failure.
        # NOTE(review): doc_optparse.exception() is assumed to report the
        # error and terminate the process -- confirm.
        doc_optparse.exception()

    indexes = interval_index_file.Indexes()

    # Can't use the wiggle iterator, as it gives no access to file
    # positions.  The following code is modified from wiggle.py.
    last_chrom = None
    start = None
    end = None
    first_pos = None
    current_span = 1
    current_step = None
    mode = "bed"

    while True:
        # Sample the offset *before* reading so it points at this line
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line:
            break
        if line.isspace() or line.startswith(("track", "#", "browser")):
            continue
        if line.startswith(("variableStep", "fixedStep")):
            # A new header closes the section that was being accumulated
            _add_step_section(indexes, last_chrom, start, end, first_pos)
            first_pos = pos
            header = bx.wiggle.parse_header(line)
            last_chrom = header['chrom']
            # variableStep headers normally carry no 'start'; in that case
            # the section start comes from its first data line.
            start = int(header['start']) - 1 if 'start' in header else None
            end = start
            current_span = int(header['span']) if 'span' in header else 1
            current_step = int(header['step']) if 'step' in header else None
            if line.startswith("variableStep"):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "bed":
            fields = line.split()
            if len(fields) < 3:
                raise ValueError("Unexpected input line: %s" % line.strip())
            indexes.add(fields[0], int(fields[1]), int(fields[2]), pos)
        elif mode == "variableStep":
            fields = line.split()
            position = int(fields[0]) - 1
            if start is None:
                start = position
            end = position + current_span
        else:
            # fixedStep: every data line advances the section by one step
            end += current_step

    # The final step section has no following header to flush it
    _add_step_section(indexes, last_chrom, start, end, first_pos)

    # The index is binary data: write it in binary mode and make sure the
    # handle is closed even if writing fails.
    with open(index_file, 'wb') as out:
        indexes.write(out)
import argparse
import marshal

from bx.misc.seekbzip2 import SeekableBzip2File

# Command line: the bz2 dump, its bzip-table file, the destination file and
# an optional --offsets-only switch.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('bz2', help='bz2 dump file')
parser.add_argument('bz2t', help='bzip-table file')
parser.add_argument('output', help='destination file')
parser.add_argument('--offsets-only', action='store_true')
args = parser.parse_args()

# Maps each title to the offset remembered for the most recent <page> line.
index = {}
dump = SeekableBzip2File(args.bz2, args.bz2t)
# Position reported by dump.tell() after the previous line, i.e. the
# position at which the line currently being examined begins.
offset = 0
try:
    for line in dump:
        if line == ' <page>\n':
            # Remember where this page starts until its title shows up
            start = offset
        elif line.startswith(' <title>'):
            # Drop an 11-character prefix and a 9-character suffix.
            # NOTE(review): the prefix tested above is only 8 characters
            # long as written here; the slice suggests the tag is expected
            # at a deeper indent -- confirm the literal against the dump.
            # NOTE(review): `start` is only bound once a <page> line has
            # been seen.
            title = line[11:-9]
            index[title] = start
        offset = dump.tell()
finally:
    dump.close()

if args.offsets_only:
    # NOTE(review): tuple() over a dict yields its keys (the titles), not
    # the stored offsets -- confirm this is what --offsets-only intends.
    index = tuple(index)
""" import atexit import marshal import ujson from bottle import request, response, route, run from bx.misc.seekbzip2 import SeekableBzip2File from xml.sax.saxutils import unescape config = { 'filename': '/run/data/en-2012-09-10.xml.bz2', 'table_filename': '/run/data/en-2012-09-10.xml.bz2t', 'index': '/run/data/en-2012-09-10.xml.marshal' } dump = SeekableBzip2File(**config) atexit.register(dump.close) with open(config['index'], 'rb') as f: index = marshal.load(f) http_headers = { 'Cache-Control': 'public, max-age=31536000', 'Content-Type': 'application/json; charset=utf-8' } def get_page(title): offset = index.get(title) if offset is None: return {'-1': {'ns': 0, 'title': title, 'missing': ''}}