class FieldedOrderedHashWriter(HashWriter):
    """On-disk hash writer that records a separate ordered position index
    for every field, so a reader can do per-field nearest-key lookups.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # fieldname -> (startpos, indexpos, length, typecode); stored in the
        # file's "extras" so it is persisted alongside the hash
        self.fieldmap = self.extras["fieldmap"] = {}
        # Most recently added key, used to enforce ascending key order
        self.lastkey = emptybytes

    def start_field(self, fieldname):
        """Begin writing keys for a new field."""
        # Remember where this field's data begins in the file
        self.fieldstart = self.dbfile.tell()
        self.fieldname = fieldname
        # Positions (relative to fieldstart) of every key in this field
        self.poses = GrowableArray("H")
        # Reset the ordering check for the new field
        self.lastkey = emptybytes

    def add(self, key, value):
        """Add a key/value pair; keys must arrive in strictly ascending order."""
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r"
                             % (self.lastkey, key))
        # Record this key's offset within the current field before writing it
        pos = self.dbfile.tell() - self.fieldstart
        self.poses.append(pos)
        HashWriter.add(self, key, value)
        self.lastkey = key

    def end_field(self):
        """Finish the current field: record its metadata, flush its index."""
        dbfile = self.dbfile
        endpos = dbfile.tell()
        self.fieldmap[self.fieldname] = (self.fieldstart, endpos,
                                         len(self.poses), self.poses.typecode)
        # The accumulated key positions go directly after the field's data
        self.poses.to_file(dbfile)
    class Writer(ColumnWriter):
        """Writes variable-length byte strings back-to-back, tracking the
        length of each document's value so a reader can split them apart.
        """

        def __init__(self, dbfile):
            assert isinstance(dbfile, StructFile)
            self._dbfile = dbfile
            # One length entry per document number
            self._lengths = GrowableArray(allow_longs=False)
            # Next document number we expect to write
            self._count = 0

        def __repr__(self):
            return "<VarBytes.Writer>"

        def fill(self, docnum):
            # Pad with zero lengths for any skipped document numbers
            missing = docnum - self._count
            if missing > 0:
                self._lengths.extend(0 for _ in xrange(missing))

        def add(self, docnum, v):
            self.fill(docnum)
            # Values are concatenated; the lengths array delimits them
            self._dbfile.write(v)
            self._lengths.append(len(v))
            self._count = docnum + 1

        def finish(self, doccount):
            self.fill(doccount)
            lens = self._lengths.array
            self._dbfile.write_array(lens)
            # The final byte records the typecode of the lengths array
            self._dbfile.write_byte(ord(lens.typecode))
# Example #3 (0)
class OrderedHashWriter(HashWriter):
    """On-disk hash writer requiring keys to be added in ascending order.
    An :class:`OrderedHashReader` can then look up "nearest keys" based on
    the ordering.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # File position of every key, in insertion (= sorted) order
        self.index = GrowableArray("H")
        # Most recently added key, used to enforce ascending order
        self.lastkey = emptybytes

    def add(self, key, value):
        """Add a key/value pair; raises ValueError if keys go backwards."""
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r"
                             % (self.lastkey, key))
        # Note where this entry starts before the superclass writes it
        self.index.append(self.dbfile.tell())
        HashWriter.add(self, key, value)
        self.lastkey = key

    def _write_extras(self):
        """Record index metadata in the extras, then append the index array."""
        # A reader needs the typecode and length to reload the position index
        self.extras["indextype"] = self.index.typecode
        self.extras["indexlen"] = len(self.index)
        HashWriter._write_extras(self)
        # The raw key positions follow the extras
        self.index.to_file(self.dbfile)
# Example #4 (0)
 def __init__(self, dbfile, allow_offsets=True, cutoff=2**15):
     """Initialize the column writer over an open StructFile.

     :param allow_offsets: whether an offsets array may be written.
     :param cutoff: doc-count threshold controlling offset writing.
     """
     assert isinstance(dbfile, StructFile)
     self._dbfile = dbfile
     self.allow_offsets = allow_offsets
     self.cutoff = cutoff
     # Number of documents written so far
     self._count = 0
     # Running byte offset of the next value to be written
     self._offset_base = 0
     # Per-document value lengths and starting offsets
     self._lengths = GrowableArray(allow_longs=False)
     self._offsets = GrowableArray(allow_longs=False)
# Example #5 (0)
    class Writer(ColumnWriter):
        """Writes variable-length byte strings to a column, recording the
        length of each value and (for large columns) running offsets so a
        reader can seek directly instead of summing lengths.
        """

        def __init__(self, dbfile, allow_offsets=True, cutoff=2**15):
            assert isinstance(dbfile, StructFile)
            self._dbfile = dbfile
            self._count = 0  # next document number expected
            self._lengths = GrowableArray(allow_longs=False)
            self._offsets = GrowableArray(allow_longs=False)
            self._offset_base = 0  # running total of value bytes written
            self.allow_offsets = allow_offsets
            # Offsets are only worth writing past this many documents
            self.cutoff = cutoff

        def __repr__(self):
            return "<VarBytes.Writer>"

        def fill(self, docnum):
            base = self._offset_base
            if docnum > self._count:
                # Pad skipped doc numbers with zero-length values; their
                # offsets all point at the current end of the data
                self._lengths.extend(0 for _ in xrange(docnum - self._count))
                self._offsets.extend(base for _ in xrange(docnum - self._count))

        def add(self, docnum, v):
            self.fill(docnum)
            self._dbfile.write(v)
            self._lengths.append(len(v))
            self._offsets.append(self._offset_base)
            self._offset_base += len(v)
            self._count = docnum + 1

        def finish(self, doccount):
            dbfile = self._dbfile
            # BUGFIX: pad out to doccount BEFORE grabbing the underlying
            # arrays -- GrowableArray can rebind its array to a wider
            # typecode while extending, which would leave references taken
            # earlier pointing at a stale array missing the padding
            self.fill(doccount)
            lengths = self._lengths.array
            offsets = self._offsets.array

            dbfile.write_array(lengths)

            # Only write the offsets if there is a large number of items in the
            # column, otherwise it's fast enough to derive them from the lens
            write_offsets = self.allow_offsets and doccount > self.cutoff
            if write_offsets:
                dbfile.write_array(offsets)

            # Backwards compatibility: previous versions only wrote the lengths,
            # and the last byte of the column was the lengths type code...
            dbfile.write(lengths.typecode.encode("ascii"))
            # ...but if we wrote offsets, make the last byte "X" so we know
            if write_offsets:
                dbfile.write(offsets.typecode.encode("ascii"))
                dbfile.write("X".encode("ascii"))
 def __init__(self, dbfile):
     """Initialize the column writer over an open StructFile."""
     assert isinstance(dbfile, StructFile)
     self._dbfile = dbfile
     # Per-document value lengths; the doc counter starts at zero
     self._lengths = GrowableArray(allow_longs=False)
     self._count = 0
# Example #7 (0)
 def __init__(self, dbfile):
     """Set up the base hash writer plus ordered-key bookkeeping."""
     HashWriter.__init__(self, dbfile)
     # Positions of all keys, in the order they are added
     self.index = GrowableArray("H")
     # Last key added; used to enforce ascending key order
     self.lastkey = emptybytes
 def start_field(self, fieldname):
     """Begin a new field: note its start position, reset per-field state."""
     self.fieldname = fieldname
     self.fieldstart = self.dbfile.tell()
     # Positions of this field's keys
     self.poses = GrowableArray("H")
     # Restart the ascending-key check for the new field
     self.lastkey = emptybytes
# Example #9 (0)
 def __init__(self, dbfile):
     """Create a writer that appends column values to *dbfile*."""
     assert isinstance(dbfile, StructFile)
     self._dbfile = dbfile
     self._count = 0  # next document number to be written
     self._lengths = GrowableArray(allow_longs=False)  # one length per doc
# Example #10 (0)
 def __init__(self, child, fixedlen):
     """Store the wrapped child writer and the fixed-length setting."""
     self._child = child
     self._fixedlen = fixedlen
     self._count = 0
     # Track the length of each value added
     self._lengths = GrowableArray()
# Example #11 (0)
 def start_field(self, fieldname):
     """Prepare per-field state before keys for *fieldname* are added."""
     start = self.dbfile.tell()
     self.fieldstart = start
     self.fieldname = fieldname
     self.poses = GrowableArray("H")  # key positions within this field
     self.lastkey = emptybytes  # reset the ascending-order check
# Example #12 (0)
 def __init__(self, dbfile):
     """Initialize the base hash writer and the ordered-index state."""
     HashWriter.__init__(self, dbfile)
     self.index = GrowableArray("H")  # file position of each added key
     self.lastkey = emptybytes  # used to enforce ascending key order