def default_docs(func):
    """
    A decorator which automatically takes care of default parameter
    documentation for common pipeline factory parameters.

    Scans the decorated function's docstring for a numpy-style
    ``Parameters`` section and, immediately after its ``-----`` underline,
    inserts the canned documentation from ``_PARAMETER_MAPPING`` for every
    parameter appearing in the function's signature.

    Parameters
    ----------
    func : callable
        Function whose ``__doc__`` is rewritten in place.

    Returns
    -------
    callable
        The same ``func`` object, with an augmented docstring.
    """
    docs = func.__doc__
    # BUG FIX: a function with no docstring used to reach StringIO(None),
    # raise TypeError, and have its __doc__ clobbered with the error text.
    # Leave such functions untouched instead.
    if docs is None:
        return func
    new_docs = ""
    signature = funcsigs.signature(func)
    try:
        buf = StringIO(docs)
        line = True
        while line:
            line = buf.readline()
            if "Parameters" in line:
                # Keep the header and its `-----` underline together.
                artificial_param_docs = [line, buf.readline()]
                for param in signature.parameters.keys():
                    doc = _PARAMETER_MAPPING.get(param, None)
                    if doc:
                        # Normalize the canned snippet: exactly one
                        # trailing newline and no leading newline.
                        if not doc.endswith("\n"):
                            doc += "\n"
                        if doc.startswith("\n"):
                            doc = doc[1:]
                        artificial_param_docs.append(doc)
                new_docs += "".join(artificial_param_docs)
                continue
            new_docs = "".join([new_docs, line])
        func.__doc__ = new_docs
    except Exception as ex:
        # Doc generation must never break import; surface the error in the
        # docstring itself so it is visible to the developer.
        func.__doc__ = str(ex)
    return func
class _FileWrapper(object): def set_file(self, filename): self.f = open(filename, 'r') def set_stdin(self): self.f = sys.stdin def set_text(self, text): from six.moves import cStringIO as StringIO self.f = StringIO(text) def readline(self): return self.f.readline() def close(self): self.f.close()
class FileCache( Iterator ):
    """
    Wrapper for a file that caches blocks of data in memory.

    **NOTE:** this is currently an incomplete file-like object, it only
    supports seek, tell, and readline (plus iteration). Reading bytes is
    currently not implemented.
    """

    def __init__( self, file, size, cache_size=DEFAULT_CACHE_SIZE, block_size=DEFAULT_BLOCK_SIZE ):
        """
        Create a new `FileCache` wrapping the file-like object `file` that
        has total size `size` and caching blocks of size `block_size`.
        """
        self.file = file
        self.size = size
        self.cache_size = cache_size
        self.block_size = block_size
        # Setup the cache
        self.nblocks = ( self.size // self.block_size ) + 1
        self.cache = LRUCache( self.cache_size )
        # Position in file
        self.dirty = True
        self.at_eof = False
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None

    def fix_dirty( self ):
        """
        Bring `current_block` back in sync with `file_pos` after a seek.
        """
        chunk, offset = self.get_block_and_offset( self.file_pos )
        if self.current_block_index != chunk:
            self.current_block = StringIO( self.load_block( chunk ) )
            # Discard `offset` characters to position within the block
            self.current_block.read( offset )
            self.current_block_index = chunk
        else:
            self.current_block.seek( offset )
        self.dirty = False

    def get_block_and_offset( self, index ):
        """
        Return (block index, offset within block) for absolute position `index`.
        """
        return int( index // self.block_size ), int( index % self.block_size )

    def load_block( self, index ):
        """
        Return the contents of block `index`, reading from the underlying
        file on a cache miss and memoizing the result in the LRU cache.
        """
        if index in self.cache:
            return self.cache[index]
        else:
            real_offset = index * self.block_size
            self.file.seek( real_offset )
            block = self.file.read( self.block_size )
            self.cache[index] = block
            return block

    def seek( self, offset, whence=0 ):
        """
        Move the file pointer to a particular offset.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            # NOTE(review): standard file semantics would be `size + offset`
            # with a negative offset; this class subtracts a positive offset
            # instead. Preserved as existing callers may rely on it.
            target_pos = self.size - offset
        else:
            raise Exception( "Invalid `whence` argument: %r", whence )
        # Check if this is a noop
        if target_pos == self.file_pos:
            return
        # Verify it is valid
        assert 0 <= target_pos < self.size, "Attempt to seek outside file"
        # Move the position
        self.file_pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the underlying file
        self.dirty = True

    def tell( self ):
        """
        Return the current logical file position.

        Added because the class docstring advertises `tell` but no
        implementation existed; mirrors `SeekableLzopFile.tell`.
        NOTE(review): `readline` does not advance `file_pos` in this class,
        so `tell` reflects the last `seek` target only — confirm intent.
        """
        return self.file_pos

    def readline( self ):
        """
        Read and return one line (including its trailing newline, if any).
        Returns "" once at EOF.
        """
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return ""
        rval = []
        while 1:
            line = self.current_block.readline()
            rval.append( line )
            if len( line ) > 0 and line[-1] == '\n':
                break
            elif self.current_block_index == self.nblocks - 1:
                self.at_eof = True
                break
            else:
                # The line continues into the next block
                self.current_block_index += 1
                self.current_block = StringIO( self.load_block( self.current_block_index ) )
        return "".join( rval )

    def __next__( self ):
        line = self.readline()
        if line == "":
            raise StopIteration
        # BUG FIX: the original fell through and returned None for every
        # line; the iterator protocol requires returning the next item.
        return line

    def __iter__( self ):
        return self

    def close( self ):
        self.file.close()
class SeekableLzopFile(Iterator):
    """
    File-like object supporting read-only semi-random access to LZO
    compressed files for which a block offset table has been generated.

    (The previous docstring referred to bz2 / `bzip-table`; this class
    decompresses with `lzo.decompress`, so that text was a stale
    copy-paste from the bzip2 variant.)

    Only `seek`, `tell`, and `readline` (plus iteration) are supported.
    """

    def __init__(self, filename, table_filename, block_cache_size=0, **kwargs):
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        # NOTE(review): opened in text mode but read as compressed binary
        # data below — py2-era code; "rb" plus bytes literals would be
        # required under py3. Preserved as-is to avoid behavior change.
        self.file = open(self.filename, "r")
        self.dirty = True
        self.at_eof = False
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None
        if block_cache_size > 0:
            self.cache = lrucache.LRUCache(block_cache_size)
        else:
            self.cache = None

    def init_table(self):
        """
        Parse the offset table file: "s" rows give the uncompressed block
        size, "o" rows give (offset, compressed_size, size) per block.
        """
        self.block_size = None
        self.block_info = []
        # Position of corresponding block in compressed file (in bytes)
        for line in open(self.table_filename):
            fields = line.split()
            if fields[0] == "s":
                self.block_size = int(fields[1])
            if fields[0] == "o":
                offset = int(fields[1])
                compressed_size = int(fields[2])
                size = int(fields[3])
                self.block_info.append((offset, compressed_size, size))
        self.nblocks = len(self.block_info)

    def close(self):
        self.file.close()

    def load_block(self, index):
        """
        Return the uncompressed contents of block `index`, consulting the
        LRU cache (when enabled) before touching the file.
        """
        if self.cache is not None and index in self.cache:
            return self.cache[index]
        else:
            offset, csize, size = self.block_info[index]
            # Get the block of compressed data
            self.file.seek(offset)
            data = self.file.read(csize)
            # Need to prepend a header for python-lzo module (silly)
            data = ''.join(('\xf0', struct.pack("!I", size), data))
            value = lzo.decompress(data)
            if self.cache is not None:
                self.cache[index] = value
            return value

    def fix_dirty(self):
        """
        Bring `current_block` back in sync with `file_pos` after a seek.
        """
        chunk, offset = self.get_block_and_offset(self.file_pos)
        if self.current_block_index != chunk:
            self.current_block = StringIO(self.load_block(chunk))
            # Discard `offset` characters to position within the block
            self.current_block.read(offset)
            self.current_block_index = chunk
        else:
            self.current_block.seek(offset)
        self.dirty = False

    def get_block_and_offset(self, index):
        """
        Return (block index, offset within block) for absolute position `index`.
        """
        return int(index // self.block_size), int(index % self.block_size)

    def seek(self, offset, whence=0):
        """
        Move the file pointer to a particular offset.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            # Total uncompressed size is not tracked, so seeking relative
            # to the end cannot be supported.
            raise Exception("seek from end not supported")
            ## target_pos = self.size - offset
        else:
            raise Exception("Invalid `whence` argument: %r", whence)
        # Check if this is a noop
        if target_pos == self.file_pos:
            return
        # Verify it is valid
        ## assert 0 <= target_pos < self.size, "Attempt to seek outside file"
        # Move the position
        self.file_pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the compressed file
        self.dirty = True

    def tell(self):
        """Return the current (uncompressed) file position."""
        return self.file_pos

    def readline(self):
        """
        Read and return one line (including its trailing newline, if any).
        Returns "" once at EOF. Advances `file_pos` by the line length.
        """
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return ""
        rval = []
        while 1:
            line = self.current_block.readline()
            self.file_pos += len(line)
            rval.append(line)
            if len(line) > 0 and line[-1] == '\n':
                break
            elif self.current_block_index == self.nblocks - 1:
                self.at_eof = True
                break
            else:
                # The line continues into the next block
                self.current_block_index += 1
                self.current_block = StringIO(
                    self.load_block(self.current_block_index))
        return "".join(rval)

    def __next__(self):
        line = self.readline()
        if line == "":
            raise StopIteration
        # BUG FIX: the original fell through and returned None for every
        # line; the iterator protocol requires returning the next item.
        return line

    def __iter__(self):
        return self
class SeekableLzopFile( Iterator ):
    """
    File-like object supporting read-only semi-random access to LZO
    compressed files for which a block offset table has been generated.

    (The previous docstring referred to bz2 / `bzip-table`; this class
    decompresses with `lzo.decompress`, so that text was a stale
    copy-paste from the bzip2 variant.)

    Only `seek`, `tell`, and `readline` (plus iteration) are supported.
    """

    def __init__( self, filename, table_filename, block_cache_size=0, **kwargs ):
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        # NOTE(review): opened in text mode but read as compressed binary
        # data below — py2-era code; "rb" plus bytes literals would be
        # required under py3. Preserved as-is to avoid behavior change.
        self.file = open( self.filename, "r" )
        self.dirty = True
        self.at_eof = False
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None
        if block_cache_size > 0:
            self.cache = lrucache.LRUCache( block_cache_size )
        else:
            self.cache = None

    def init_table( self ):
        """
        Parse the offset table file: "s" rows give the uncompressed block
        size, "o" rows give (offset, compressed_size, size) per block.
        """
        self.block_size = None
        self.block_info = []
        # Position of corresponding block in compressed file (in bytes)
        for line in open( self.table_filename ):
            fields = line.split()
            if fields[0] == "s":
                self.block_size = int( fields[1] )
            if fields[0] == "o":
                offset = int( fields[1] )
                compressed_size = int( fields[2] )
                size = int( fields[3] )
                self.block_info.append( ( offset, compressed_size, size ) )
        self.nblocks = len( self.block_info )

    def close( self ):
        self.file.close()

    def load_block( self, index ):
        """
        Return the uncompressed contents of block `index`, consulting the
        LRU cache (when enabled) before touching the file.
        """
        if self.cache is not None and index in self.cache:
            return self.cache[index]
        else:
            offset, csize, size = self.block_info[ index ]
            # Get the block of compressed data
            self.file.seek( offset )
            data = self.file.read( csize )
            # Need to prepend a header for python-lzo module (silly)
            data = ''.join( ( '\xf0', struct.pack( "!I", size ), data ) )
            value = lzo.decompress( data )
            if self.cache is not None:
                self.cache[index] = value
            return value

    def fix_dirty( self ):
        """
        Bring `current_block` back in sync with `file_pos` after a seek.
        """
        chunk, offset = self.get_block_and_offset( self.file_pos )
        if self.current_block_index != chunk:
            self.current_block = StringIO( self.load_block( chunk ) )
            # Discard `offset` characters to position within the block
            self.current_block.read( offset )
            self.current_block_index = chunk
        else:
            self.current_block.seek( offset )
        self.dirty = False

    def get_block_and_offset( self, index ):
        """
        Return (block index, offset within block) for absolute position `index`.
        """
        return int( index // self.block_size ), int( index % self.block_size )

    def seek( self, offset, whence=0 ):
        """
        Move the file pointer to a particular offset.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            # Total uncompressed size is not tracked, so seeking relative
            # to the end cannot be supported.
            raise Exception( "seek from end not supported" )
            ## target_pos = self.size - offset
        else:
            raise Exception( "Invalid `whence` argument: %r", whence )
        # Check if this is a noop
        if target_pos == self.file_pos:
            return
        # Verify it is valid
        ## assert 0 <= target_pos < self.size, "Attempt to seek outside file"
        # Move the position
        self.file_pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the compressed file
        self.dirty = True

    def tell( self ):
        """Return the current (uncompressed) file position."""
        return self.file_pos

    def readline( self ):
        """
        Read and return one line (including its trailing newline, if any).
        Returns "" once at EOF. Advances `file_pos` by the line length.
        """
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return ""
        rval = []
        while 1:
            line = self.current_block.readline()
            self.file_pos += len( line )
            rval.append( line )
            if len( line ) > 0 and line[-1] == '\n':
                break
            elif self.current_block_index == self.nblocks - 1:
                self.at_eof = True
                break
            else:
                # The line continues into the next block
                self.current_block_index += 1
                self.current_block = StringIO( self.load_block( self.current_block_index ) )
        return "".join( rval )

    def __next__( self ):
        line = self.readline()
        if line == "":
            raise StopIteration
        # BUG FIX: the original fell through and returned None for every
        # line; the iterator protocol requires returning the next item.
        return line

    def __iter__( self ):
        return self