def create_file(self, name):
    def onclose_fn(sfile):
        sfile.file.close()
    f = StructFile(DatastoreFile(key_name="%s%s" % (self.name, name)),
                   name=name, onclose=onclose_fn)
    return f

def __init__(self, table_file, blocksize=16 * 1024, compressed=0,
             prefixcoding=False, postings=False, stringids=False,
             checksize=True):
    self.table_file = table_file
    self.blocksize = blocksize

    if compressed > 0 and not has_zlib:
        raise Exception("zlib is not available: cannot compress table")
    self.compressed = compressed
    self.prefixcoding = prefixcoding

    self.haspostings = postings
    if postings:
        self.offset = 0
        self.postcount = 0
        self.lastpostid = None
        self.stringids = stringids
        self.posting_file = StructFile(tempfile.TemporaryFile())

    self.rowbuffer = []
    self.lastkey = None
    self.blockfilled = 0
    self.dir = []

    # Remember where we started writing
    self.start = table_file.tell()
    # Save space for a pointer to the directory
    table_file.write_ulong(0)
    # Save space for a pointer to the postings
    table_file.write_ulong(0)

    self.options = {"haspostings": postings,
                    "compressed": compressed,
                    "prefixcoding": prefixcoding,
                    "stringids": stringids}

def __init__(self, table_file, blocksize=16 * 1024, compressed=0,
             prefixcoding=False, postings=False, stringids=False,
             checksize=True):
    self.table_file = table_file
    self.blocksize = blocksize

    if compressed > 0 and not has_zlib:
        raise Exception("zlib is not available: cannot compress table")
    self.compressed = compressed
    self.prefixcoding = prefixcoding

    self.haspostings = postings
    if postings:
        self.offset = 0
        self.postcount = 0
        self.lastpostid = None
        self.stringids = stringids
        self.posting_file = StructFile(tempfile.TemporaryFile())

    self.rowbuffer = []
    self.lastkey = None
    self.blockfilled = 0
    self.keys = []
    self.pointers = array("L")

    # Remember where we started writing
    self.start = table_file.tell()
    # Save space for a pointer to the directory
    table_file.write_ulong(0)
    # Save space for a pointer to the postings
    table_file.write_ulong(0)

    self.options = {"haspostings": postings,
                    "compressed": compressed,
                    "prefixcoding": prefixcoding,
                    "stringids": stringids}

def open_file(self, name):
    if name not in self.files:
        raise NameError
    return StructFile(StringIO(self.files[name]))

def create_file(self, name):
    def onclose_fn(sfile):
        self.files[name] = sfile.file.getvalue()
    f = StructFile(StringIO(), name=name, onclose=onclose_fn)
    return f

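# A minimal usage sketch for the two in-memory methods above. Hedged
# assumptions: create_file/open_file belong to a RamStorage-like class whose
# constructor initializes a `files` dict mapping names to byte strings, and
# StructFile provides a read_ulong counterpart to write_ulong. The class name
# and sample data here are illustrative only.
st = RamStorage()
f = st.create_file("kilo")      # StructFile wrapping an in-memory StringIO
f.write_ulong(1234)
f.close()                       # onclose_fn copies the buffer into st.files
g = st.open_file("kilo")        # re-open the stored bytes for reading
print(g.read_ulong())           # -> 1234
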
def open_file(self, name, compressed=False):
    f = StructFile(open(self._fpath(name), "rb"))
    f._name = name
    return f

def create_file(self, name):
    f = StructFile(open(self._fpath(name), "wb"))
    f._name = name
    return f

def open_file(self, name, compressed=False):
    f = StructFile(open(self._fpath(name), "rb"))
    f._name = name
    return f

def open_file(self, name):
    return StructFile(DatastoreFile.loadfile("%s%s" % (self.name, name)))

class TableWriter(object):
    def __init__(self, table_file, blocksize=16 * 1024, compressed=0,
                 prefixcoding=False, postings=False, stringids=False,
                 checksize=True):
        self.table_file = table_file
        self.blocksize = blocksize

        if compressed > 0 and not has_zlib:
            raise Exception("zlib is not available: cannot compress table")
        self.compressed = compressed
        self.prefixcoding = prefixcoding

        self.haspostings = postings
        if postings:
            self.offset = 0
            self.postcount = 0
            self.lastpostid = None
            self.stringids = stringids
            self.posting_file = StructFile(tempfile.TemporaryFile())

        self.rowbuffer = []
        self.lastkey = None
        self.blockfilled = 0
        self.keys = []
        self.pointers = array("L")

        # Remember where we started writing
        self.start = table_file.tell()
        # Save space for a pointer to the directory
        table_file.write_ulong(0)
        # Save space for a pointer to the postings
        table_file.write_ulong(0)

        self.options = {"haspostings": postings,
                        "compressed": compressed,
                        "prefixcoding": prefixcoding,
                        "stringids": stringids}

    def close(self):
        # If there is still a block waiting to be written, flush it out
        if self.rowbuffer:
            self._write_block()

        tf = self.table_file
        haspostings = self.haspostings

        # Remember where we started writing the directory
        dirpos = tf.tell()
        # Write the directory
        tf.write_pickle(self.keys)
        tf.write_array(self.pointers)
        tf.write_pickle(self.options)

        if haspostings:
            # Remember where we started the postings
            postpos = tf.tell()
            # Seek back to the beginning of the postings and
            # copy them onto the end of the table file.
            self.posting_file.seek(0)
            shutil.copyfileobj(self.posting_file, tf)
            self.posting_file.close()

        # Seek back to where we started writing and write a
        # pointer to the directory
        tf.seek(self.start)
        tf.write_ulong(dirpos)
        if haspostings:
            # Write a pointer to the postings
            tf.write_ulong(postpos)

        tf.close()

    def _write_block(self):
        buf = self.rowbuffer
        key = buf[0][0]
        compressed = self.compressed

        self.keys.append(key)
        self.pointers.append(self.table_file.tell())
        if compressed:
            pck = dumps(buf)
            self.table_file.write_string(compress(pck, compressed))
        else:
            self.table_file.write_pickle(buf)

        self.rowbuffer = []
        self.blockfilled = 0

    def write_posting(self, id, data, writefn):
        # IDs must be added in increasing order
        if id <= self.lastpostid:
            raise IndexError("IDs must increase: %r..%r"
                             % (self.lastpostid, id))

        pf = self.posting_file
        if self.stringids:
            pf.write_string(id.encode("utf8"))
        else:
            lastpostid = self.lastpostid or 0
            pf.write_varint(id - lastpostid)

        self.lastpostid = id
        self.postcount += 1

        return writefn(pf, data)

    def add_row(self, key, data):
        # Note: call this AFTER you add any postings!

        # Keys must be added in increasing order
        if key <= self.lastkey:
            raise IndexError("Keys must increase: %r..%r"
                             % (self.lastkey, key))

        rb = self.rowbuffer
        if isinstance(data, array):
            self.blockfilled += len(data) * data.itemsize
        else:
            # Ugh! We're pickling twice! At least it's fast.
            self.blockfilled += len(dumps(data))
        self.lastkey = key

        if self.haspostings:
            # Add the posting info to the stored row data
            endoffset = self.posting_file.tell()
            length = endoffset - self.offset
            rb.append((key, (self.offset, length, self.postcount, data)))

            # Reset the posting variables
            self.offset = endoffset
            self.postcount = 0
            self.lastpostid = None
        else:
            rb.append((key, data))

        # If this row filled up a block, flush it out
        if self.blockfilled >= self.blocksize:
            #print len(rb)
            self._write_block()

class TableWriter(object):
    def __init__(self, table_file, blocksize=16 * 1024, compressed=0,
                 prefixcoding=False, postings=False, stringids=False,
                 checksize=True):
        self.table_file = table_file
        self.blocksize = blocksize

        if compressed > 0 and not has_zlib:
            raise Exception("zlib is not available: cannot compress table")
        self.compressed = compressed
        self.prefixcoding = prefixcoding

        self.haspostings = postings
        if postings:
            self.offset = 0
            self.postcount = 0
            self.lastpostid = None
            self.stringids = stringids
            self.posting_file = StructFile(tempfile.TemporaryFile())

        self.rowbuffer = []
        self.lastkey = None
        self.blockfilled = 0
        self.dir = []

        # Remember where we started writing
        self.start = table_file.tell()
        # Save space for a pointer to the directory
        table_file.write_ulong(0)
        # Save space for a pointer to the postings
        table_file.write_ulong(0)

        self.options = {"haspostings": postings,
                        "compressed": compressed,
                        "prefixcoding": prefixcoding,
                        "stringids": stringids}

    def close(self):
        # If there is still a block waiting to be written, flush it out
        if self.rowbuffer:
            self._write_block()

        tf = self.table_file
        haspostings = self.haspostings

        # Remember where we started writing the directory
        dirpos = tf.tell()
        # Write the directory
        tf.write_pickle((tuple(self.dir), self.options))

        if haspostings:
            # Remember where we started the postings
            postpos = tf.tell()
            # Seek back to the beginning of the postings and
            # copy them onto the end of the table file.
            self.posting_file.seek(0)
            shutil.copyfileobj(self.posting_file, tf)
            self.posting_file.close()

        # Seek back to where we started writing and write a
        # pointer to the directory
        tf.seek(self.start)
        tf.write_ulong(dirpos)
        if haspostings:
            # Write a pointer to the postings
            tf.write_ulong(postpos)

        tf.close()

    def _write_block(self):
        buf = self.rowbuffer
        key = buf[0][0]
        compressed = self.compressed

        self.dir.append((key, self.table_file.tell()))
        if compressed:
            pck = dump_pickle_str(buf, -1)
            self.table_file.write_string(compress(pck, compressed))
        else:
            self.table_file.write_pickle(buf)

        self.rowbuffer = []
        self.blockfilled = 0

    def write_posting(self, id, data, writefn):
        # IDs must be added in increasing order
        if id <= self.lastpostid:
            raise IndexError("IDs must increase: %r..%r"
                             % (self.lastpostid, id))

        pf = self.posting_file
        if self.stringids:
            pf.write_string(id.encode("utf8"))
        else:
            lastpostid = self.lastpostid or 0
            pf.write_varint(id - lastpostid)

        self.lastpostid = id
        self.postcount += 1

        return writefn(pf, data)

    def add_row(self, key, data):
        # Note: call this AFTER you add any postings!

        # Keys must be added in increasing order
        if key <= self.lastkey:
            raise IndexError("Keys must increase: %r..%r"
                             % (self.lastkey, key))

        rb = self.rowbuffer
        if isinstance(data, array):
            self.blockfilled += len(data) * data.itemsize
        else:
            # Ugh! We're pickling twice! At least it's fast.
            self.blockfilled += len(dump_pickle_str(data, -1))
        self.lastkey = key

        if self.haspostings:
            # Add the posting info to the stored row data
            endoffset = self.posting_file.tell()
            length = endoffset - self.offset
            rb.append((key, (self.offset, length, self.postcount, data)))

            # Reset the posting variables
            self.offset = endoffset
            self.postcount = 0
            self.lastpostid = None
        else:
            rb.append((key, data))

        # If this row filled up a block, flush it out
        if self.blockfilled >= self.blocksize:
            self._write_block()
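
# A minimal usage sketch for TableWriter, not a definitive recipe. Hedged
# assumptions: StructFile, has_zlib and dump_pickle_str come from the
# surrounding module; a temporary file wrapped in StructFile supports the
# write_ulong/write_pickle/write_varint calls used by the class; and the
# keys, posting ids, payloads and the write_post callback below are made up
# for illustration.
import tempfile

tf = StructFile(tempfile.TemporaryFile())
writer = TableWriter(tf, postings=True)

# Hypothetical posting-value writer passed as the writefn callback.
def write_post(pfile, value):
    pfile.write_varint(value)

# Postings for a key are written first (ids strictly increasing),
# then add_row() records the key along with the posting offsets.
writer.write_posting(1, 5, write_post)
writer.write_posting(7, 2, write_post)
writer.add_row("alfa", {"docfreq": 2})

writer.write_posting(3, 1, write_post)
writer.add_row("bravo", {"docfreq": 1})

# close() flushes the final block, writes the directory, appends the
# buffered postings and patches the two reserved pointers at the start
# of the table file.
writer.close()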