Example #1
    def create_file(self, name):
        def onclose_fn(sfile):
            sfile.file.close()

        f = StructFile(DatastoreFile(key_name="%s%s" % (self.name, name)),
                       name=name,
                       onclose=onclose_fn)
        return f
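StructFile here wraps the raw file object and runs the onclose callback just before the file is closed, which is how the storage layer gets a chance to persist or release the backing store. A minimal standalone sketch of that callback pattern (a hypothetical wrapper for illustration, not the real StructFile):

class CallbackFile(object):
    # Hypothetical sketch: run a callback before closing the wrapped file.
    def __init__(self, fileobj, name=None, onclose=None):
        self.file = fileobj
        self.name = name
        self.onclose = onclose

    def write(self, data):
        self.file.write(data)

    def close(self):
        if self.onclose is not None:
            self.onclose(self)  # e.g. persist or release the backing store
        self.file.close()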
Example #2
    def __init__(self,
                 table_file,
                 blocksize=16 * 1024,
                 compressed=0,
                 prefixcoding=False,
                 postings=False,
                 stringids=False,
                 checksize=True):
        self.table_file = table_file
        self.blocksize = blocksize

        if compressed > 0 and not has_zlib:
            raise Exception("zlib is not available: cannot compress table")
        self.compressed = compressed
        self.prefixcoding = prefixcoding

        self.haspostings = postings
        if postings:
            self.offset = 0
            self.postcount = 0
            self.lastpostid = None
            self.stringids = stringids
            self.posting_file = StructFile(tempfile.TemporaryFile())

        self.rowbuffer = []
        self.lastkey = None
        self.blockfilled = 0

        self.dir = []

        # Remember where we started writing
        self.start = table_file.tell()
        # Save space for a pointer to the directory
        table_file.write_ulong(0)
        # Save space for a pointer to the postings
        table_file.write_ulong(0)

        self.options = {
            "haspostings": postings,
            "compressed": compressed,
            "prefixcoding": prefixcoding,
            "stringids": stringids
        }
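The two write_ulong(0) calls reserve fixed-width slots at the start of the table; close() (shown in full in Examples #9 and #10 below) later seeks back and overwrites them with the real directory and postings offsets. A standalone sketch of this reserve-then-backpatch pattern, assuming for illustration an 8-byte little-endian unsigned long (the actual on-disk width may differ):

import struct
from io import BytesIO

f = BytesIO()
start = f.tell()
f.write(struct.pack("<Q", 0))       # placeholder for the directory pointer

f.write(b"...row blocks...")        # the body is written first

dirpos = f.tell()                   # the directory starts here
f.write(b"...directory...")

f.seek(start)                       # back-patch the placeholder
f.write(struct.pack("<Q", dirpos))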
Example #3
    def __init__(self, table_file, blocksize=16 * 1024,
                 compressed=0, prefixcoding=False,
                 postings=False, stringids=False,
                 checksize=True):
        self.table_file = table_file
        self.blocksize = blocksize

        if compressed > 0 and not has_zlib:
            raise Exception("zlib is not available: cannot compress table")
        self.compressed = compressed
        self.prefixcoding = prefixcoding

        self.haspostings = postings
        if postings:
            self.offset = 0
            self.postcount = 0
            self.lastpostid = None
            self.stringids = stringids
            self.posting_file = StructFile(tempfile.TemporaryFile())

        self.rowbuffer = []
        self.lastkey = None
        self.blockfilled = 0

        self.keys = []
        self.pointers = array("L")

        # Remember where we started writing
        self.start = table_file.tell()
        # Save space for a pointer to the directory
        table_file.write_ulong(0)
        # Save space for a pointer to the postings
        table_file.write_ulong(0)

        self.options = {"haspostings": postings,
                        "compressed": compressed,
                        "prefixcoding": prefixcoding,
                        "stringids": stringids}
Example #4
    def open_file(self, name):
        if name not in self.files:
            raise NameError(name)
        return StructFile(StringIO(self.files[name]))
Example #5
    def create_file(self, name):
        def onclose_fn(sfile):
            self.files[name] = sfile.file.getvalue()

        f = StructFile(StringIO(), name=name, onclose=onclose_fn)
        return f
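Examples #4 and #5 pair up as an in-memory storage: create_file buffers writes in a StringIO and its onclose callback snapshots the bytes into self.files, where open_file finds them later. A minimal standalone sketch of that round trip (hypothetical class and names, snapshotting on close just like the originals):

from io import BytesIO

class RamStorage(object):
    # Hypothetical sketch of the create_file/open_file round trip.
    def __init__(self):
        self.files = {}

    def create_file(self, name):
        storage = self
        class _Buffer(BytesIO):
            def close(self):
                storage.files[name] = self.getvalue()  # snapshot on close
                BytesIO.close(self)
        return _Buffer()

    def open_file(self, name):
        if name not in self.files:
            raise NameError(name)
        return BytesIO(self.files[name])

ram = RamStorage()
f = ram.create_file("seg1")
f.write(b"hello")
f.close()
assert ram.open_file("seg1").read() == b"hello"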
Example #6
    def open_file(self, name, compressed=False):
        # `compressed` is accepted for interface compatibility but unused here
        f = StructFile(open(self._fpath(name), "rb"))
        f._name = name
        return f
Example #7
    def create_file(self, name):
        f = StructFile(open(self._fpath(name), "wb"))
        f._name = name
        return f
Example #8
    def open_file(self, name):
        return StructFile(DatastoreFile.loadfile("%s%s" % (self.name, name)))
Example #9
class TableWriter(object):
    def __init__(self, table_file, blocksize=16 * 1024,
                 compressed=0, prefixcoding=False,
                 postings=False, stringids=False,
                 checksize=True):
        self.table_file = table_file
        self.blocksize = blocksize
        
        if compressed > 0 and not has_zlib:
            raise Exception("zlib is not available: cannot compress table")
        self.compressed = compressed
        self.prefixcoding = prefixcoding
        
        self.haspostings = postings
        if postings:
            self.offset = 0
            self.postcount = 0
            self.lastpostid = None
            self.stringids = stringids
            self.posting_file = StructFile(tempfile.TemporaryFile())
        
        self.rowbuffer = []
        self.lastkey = None
        self.blockfilled = 0
        
        self.keys = []
        self.pointers = array("L")
        
        # Remember where we started writing
        self.start = table_file.tell()
        # Save space for a pointer to the directory
        table_file.write_ulong(0)
        # Save space for a pointer to the postings
        table_file.write_ulong(0)
        
        self.options = {"haspostings": postings,
                        "compressed": compressed,
                        "prefixcoding": prefixcoding,
                        "stringids": stringids}
    
    def close(self):
        # If there is still a block waiting to be written, flush it out
        if self.rowbuffer:
            self._write_block()
        
        tf = self.table_file
        haspostings = self.haspostings
        
        # Remember where we started writing the directory
        dirpos = tf.tell()
        # Write the directory
        tf.write_pickle(self.keys)
        tf.write_array(self.pointers)
        tf.write_pickle(self.options)
        
        if haspostings:
            # Remember where we started the postings
            postpos = tf.tell()
            # Seek back to the beginning of the postings and
            # copy them onto the end of the table file.
            self.posting_file.seek(0)
            shutil.copyfileobj(self.posting_file, tf)
            self.posting_file.close()
        
        # Seek back to where we started writing and write a
        # pointer to the directory
        tf.seek(self.start)
        tf.write_ulong(dirpos)
        
        if haspostings:
            # Write a pointer to the postings
            tf.write_ulong(postpos)
        
        tf.close()
    
    def _write_block(self):
        buf = self.rowbuffer
        key = buf[0][0]
        compressed = self.compressed
        
        self.keys.append(key)
        self.pointers.append(self.table_file.tell())
        if compressed:
            pck = dumps(buf)
            self.table_file.write_string(compress(pck, compressed))
        else:
            self.table_file.write_pickle(buf)
        
        self.rowbuffer = []
        self.blockfilled = 0
    
    def write_posting(self, id, data, writefn):
        # IDs must be added in increasing order
        if self.lastpostid is not None and id <= self.lastpostid:
            raise IndexError("IDs must increase: %r..%r" % (self.lastpostid, id))
        
        pf = self.posting_file
        if self.stringids:
            pf.write_string(id.encode("utf8"))
        else:
            lastpostid = self.lastpostid or 0
            pf.write_varint(id - lastpostid)
        
        self.lastpostid = id
        self.postcount += 1
        
        return writefn(pf, data)
    
    def add_row(self, key, data):
        # Note: call this AFTER you add any postings!
        # Keys must be added in increasing order
        if self.lastkey is not None and key <= self.lastkey:
            raise IndexError("Keys must increase: %r..%r" % (self.lastkey, key))
        
        rb = self.rowbuffer
        
        if isinstance(data, array):
            self.blockfilled += len(data) * data.itemsize
        else:
            # Ugh! We're pickling twice! At least it's fast.
            self.blockfilled += len(dumps(data))
        self.lastkey = key
        
        if self.haspostings:
            # Add the posting info to the stored row data
            endoffset = self.posting_file.tell()
            length = endoffset - self.offset
            rb.append((key, (self.offset, length, self.postcount, data)))
            
            # Reset the posting variables
            self.offset = endoffset
            self.postcount = 0
            self.lastpostid = None
        else:
            rb.append((key, data))
        
        # If this row filled up a block, flush it out
        if self.blockfilled >= self.blocksize:
            self._write_block()
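When stringids is false, write_posting stores each numeric ID as a varint-encoded delta from the previous ID, so a run of increasing IDs stays small on disk. A minimal sketch of a typical varint scheme (7 data bits per byte, high bit as a continuation flag; the actual write_varint encoding may differ):

def write_varint(out, n):
    # 7 data bits per byte; a set high bit means another byte follows.
    while n >= 0x80:
        out.append((n & 0x7F) | 0x80)
        n >>= 7
    out.append(n)

buf = bytearray()
last = 0
for postid in (3, 10, 300):           # IDs must be increasing
    write_varint(buf, postid - last)  # store the delta, not the raw ID
    last = postid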
Example #10
class TableWriter(object):
    def __init__(self,
                 table_file,
                 blocksize=16 * 1024,
                 compressed=0,
                 prefixcoding=False,
                 postings=False,
                 stringids=False,
                 checksize=True):
        self.table_file = table_file
        self.blocksize = blocksize

        if compressed > 0 and not has_zlib:
            raise Exception("zlib is not available: cannot compress table")
        self.compressed = compressed
        self.prefixcoding = prefixcoding

        self.haspostings = postings
        if postings:
            self.offset = 0
            self.postcount = 0
            self.lastpostid = None
            self.stringids = stringids
            self.posting_file = StructFile(tempfile.TemporaryFile())

        self.rowbuffer = []
        self.lastkey = None
        self.blockfilled = 0

        self.dir = []

        # Remember where we started writing
        self.start = table_file.tell()
        # Save space for a pointer to the directory
        table_file.write_ulong(0)
        # Save space for a pointer to the postings
        table_file.write_ulong(0)

        self.options = {
            "haspostings": postings,
            "compressed": compressed,
            "prefixcoding": prefixcoding,
            "stringids": stringids
        }

    def close(self):
        # If there is still a block waiting to be written, flush it out
        if self.rowbuffer:
            self._write_block()

        tf = self.table_file
        haspostings = self.haspostings

        # Remember where we started writing the directory
        dirpos = tf.tell()
        # Write the directory
        tf.write_pickle((tuple(self.dir), self.options))

        if haspostings:
            # Remember where we started the postings
            postpos = tf.tell()
            # Seek back to the beginning of the postings and
            # copy them onto the end of the table file.
            self.posting_file.seek(0)
            shutil.copyfileobj(self.posting_file, tf)
            self.posting_file.close()

        # Seek back to where we started writing and write a
        # pointer to the directory
        tf.seek(self.start)
        tf.write_ulong(dirpos)

        if haspostings:
            # Write a pointer to the postings
            tf.write_ulong(postpos)

        tf.close()

    def _write_block(self):
        buf = self.rowbuffer
        key = buf[0][0]
        compressed = self.compressed

        self.dir.append((key, self.table_file.tell()))
        if compressed:
            pck = dump_pickle_str(buf, -1)
            self.table_file.write_string(compress(pck, compressed))
        else:
            self.table_file.write_pickle(buf)

        self.rowbuffer = []
        self.blockfilled = 0

    def write_posting(self, id, data, writefn):
        # IDs must be added in increasing order
        if self.lastpostid is not None and id <= self.lastpostid:
            raise IndexError("IDs must increase: %r..%r" %
                             (self.lastpostid, id))

        pf = self.posting_file
        if self.stringids:
            pf.write_string(id.encode("utf8"))
        else:
            lastpostid = self.lastpostid or 0
            pf.write_varint(id - lastpostid)

        self.lastpostid = id
        self.postcount += 1

        return writefn(pf, data)

    def add_row(self, key, data):
        # Note: call this AFTER you add any postings!
        # Keys must be added in increasing order
        if self.lastkey is not None and key <= self.lastkey:
            raise IndexError("Keys must increase: %r..%r" %
                             (self.lastkey, key))

        rb = self.rowbuffer

        if isinstance(data, array):
            self.blockfilled += len(data) * data.itemsize
        else:
            # Ugh! We're pickling twice! At least it's fast.
            self.blockfilled += len(dump_pickle_str(data, -1))
        self.lastkey = key

        if self.haspostings:
            # Add the posting info to the stored row data
            endoffset = self.posting_file.tell()
            length = endoffset - self.offset
            rb.append((key, (self.offset, length, self.postcount, data)))

            # Reset the posting variables
            self.offset = endoffset
            self.postcount = 0
            self.lastpostid = None
        else:
            rb.append((key, data))

        # If this row filled up a block, flush it out
        if self.blockfilled >= self.blocksize:
            self._write_block()
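The ordering contract matters: all of a key's postings must be written with write_posting before add_row is called for that key, because add_row snapshots posting_file.tell() and postcount into the stored row. A hedged usage sketch (assuming tf is an already-open, writable StructFile-backed table file; write_payload is an illustrative serializer, not part of the class):

# Hypothetical usage; `tf` is assumed to be a writable StructFile.
writer = TableWriter(tf, postings=True)

def write_payload(pf, data):
    pf.write_string(data)          # any per-posting serializer will do

for docid, payload in [(1, "a"), (5, "b")]:
    writer.write_posting(docid, payload, write_payload)
writer.add_row("term", None)       # call AFTER this key's postings

writer.close()                     # flushes the last block, back-patches the pointers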