    class Writer(ColumnWriter):
        def __init__(self, dbfile):
            assert isinstance(dbfile, StructFile)
            self._dbfile = dbfile
            self._count = 0
            self._lengths = GrowableArray(allow_longs=False)

        def __repr__(self):
            return "<VarBytes.Writer>"

        def fill(self, docnum):
            if docnum > self._count:
                self._lengths.extend(0 for _ in xrange(docnum - self._count))

        def add(self, docnum, v):
            self.fill(docnum)
            self._dbfile.write(v)
            self._lengths.append(len(v))
            self._count = docnum + 1

        def finish(self, doccount):
            self.fill(doccount)
            lengths = self._lengths.array

            self._dbfile.write_array(lengths)
            # Write the typecode for the lengths
            self._dbfile.write_byte(ord(lengths.typecode))
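This Writer is the writer class nested inside Whoosh's VarBytesColumn; GrowableArray, StructFile, and xrange are Whoosh internals (in the 2.x line they come from whoosh.util.numlists, whoosh.filedb.structfile, and whoosh.compat, though the paths may differ between versions). A minimal sketch of how such a column writer is driven, assuming a Whoosh 2.x install; the file name is illustrative:

from whoosh.columns import VarBytesColumn
from whoosh.filedb.structfile import StructFile

with open("col.bin", "wb") as f:
    # Column.writer() simply instantiates the nested Writer class
    writer = VarBytesColumn().writer(StructFile(f))
    writer.add(0, b"alpha")
    writer.add(3, b"delta")  # fill() pads docs 1-2 with zero-length entries
    # finish() appends the lengths array, then its typecode as a single byte
    writer.finish(5)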
Example #2
class FieldedOrderedHashWriter(HashWriter):
    """Implements an on-disk hash, but writes separate position indexes for
    each field.
    """
    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # Map field names to (startpos, indexpos, length, typecode)
        self.fieldmap = self.extras["fieldmap"] = {}

        # Keep track of the last key added
        self.lastkey = emptybytes

    def start_field(self, fieldname):
        self.fieldstart = self.dbfile.tell()
        self.fieldname = fieldname
        # Keep an array of the positions of all keys
        self.poses = GrowableArray("H")
        self.lastkey = emptybytes

    def add(self, key, value):
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r" %
                             (self.lastkey, key))
        self.poses.append(self.dbfile.tell() - self.fieldstart)
        HashWriter.add(self, key, value)
        self.lastkey = key

    def end_field(self):
        dbfile = self.dbfile
        fieldname = self.fieldname
        poses = self.poses
        self.fieldmap[fieldname] = (self.fieldstart, dbfile.tell(), len(poses),
                                    poses.typecode)
        poses.to_file(dbfile)
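A sketch of the protocol this class implies, with illustrative field names and values: one start_field()/end_field() pair per field, keys added in strictly increasing byte order within each field, and a final HashWriter.close() to flush the extras (which include fieldmap):

w = FieldedOrderedHashWriter(dbfile)

w.start_field("title")
w.add(b"apple", b"v1")
w.add(b"banana", b"v2")    # must sort after b"apple" or ValueError is raised
w.end_field()              # records (startpos, indexpos, length, typecode)

w.start_field("body")
w.add(b"aardvark", b"v3")  # key ordering resets for each new field
w.end_field()

w.close()                  # inherited from HashWriter; writes the extras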
Example #3
class OrderedHashWriter(HashWriter):
    """Implements an on-disk hash, but requires that keys be added in order.
    An :class:`OrderedHashReader` can then look up "nearest keys" based on
    the ordering.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # Keep an array of the positions of all keys
        self.index = GrowableArray("H")
        # Keep track of the last key added
        self.lastkey = emptybytes

    def add(self, key, value):
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r"
                             % (self.lastkey, key))
        self.index.append(self.dbfile.tell())
        HashWriter.add(self, key, value)
        self.lastkey = key

    def _write_extras(self):
        dbfile = self.dbfile
        index = self.index

        # Store metadata about the index array
        self.extras["indextype"] = index.typecode
        self.extras["indexlen"] = len(index)
        # Write the extras
        HashWriter._write_extras(self)
        # Write the index array
        index.to_file(dbfile)
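The strictly increasing key check is what makes the stored positions useful: since keys are sorted, a reader can bisect them to find the nearest key, which is the lookup the docstring attributes to OrderedHashReader. A toy, stdlib-only illustration of the idea (not Whoosh's actual reader code):

from bisect import bisect_left

def closest_key(sorted_keys, probe):
    # First stored key >= probe, or None if probe sorts after all keys
    i = bisect_left(sorted_keys, probe)
    return sorted_keys[i] if i < len(sorted_keys) else None

assert closest_key([b"aa", b"ab", b"ba"], b"ac") == b"ba"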
Example #7
    class Writer(ColumnWriter):
        def __init__(self, dbfile, allow_offsets=True, cutoff=2**15):
            assert isinstance(dbfile, StructFile)
            self._dbfile = dbfile
            self._count = 0
            self._lengths = GrowableArray(allow_longs=False)
            self._offsets = GrowableArray(allow_longs=False)
            self._offset_base = 0
            self.allow_offsets = allow_offsets
            self.cutoff = cutoff

        def __repr__(self):
            return "<VarBytes.Writer>"

        def fill(self, docinfo):
            docnum, docbase = docinfo
            base = self._offset_base
            if docnum - docbase > self._count:
                self._lengths.extend(
                    0 for _ in xrange((docnum - docbase) - self._count))
                self._offsets.extend(
                    base for _ in xrange((docnum - docbase) - self._count))

        def add(self, docinfo, v):
            docnum, docbase = docinfo
            self.fill(docinfo)
            self._dbfile.write(v)
            self._lengths.append(len(v))
            self._offsets.append(self._offset_base)
            self._offset_base += len(v)
            self._count = (docnum - docbase) + 1

        def finish(self, docinfo):
            docnum, docbase = docinfo
            dbfile = self._dbfile
            # Pad out to the final count before snapshotting the arrays:
            # appending can rebind a GrowableArray's underlying array when
            # its typecode has to grow
            self.fill(docinfo)
            lengths = self._lengths.array
            offsets = self._offsets.array

            dbfile.write_array(lengths)

            # Only write the offsets if the column holds a large number of
            # items; otherwise it's fast enough to derive them from the lengths
            write_offsets = (self.allow_offsets
                             and docnum - docbase > self.cutoff)
            if write_offsets:
                dbfile.write_array(offsets)

            # Backwards compatibility: previous versions only wrote the lengths,
            # and the last byte of the column was the lengths type code...
            dbfile.write(lengths.typecode.encode("ascii"))
            # ...but if we wrote offsets, make the last byte "X" so we know
            if write_offsets:
                dbfile.write(offsets.typecode.encode("ascii"))
                dbfile.write("X".encode("ascii"))
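The trailer written above is self-describing: the column's last byte is either the lengths typecode (old format) or b"X", in which case the two bytes before it are the lengths and offsets typecodes. A sketch of how a reader could detect this (an illustrative helper, not Whoosh's actual reader code):

def split_trailer(raw):
    # raw is the whole column as bytes; returns (lengths_tc, offsets_tc)
    if raw[-1:] == b"X":
        # New format: ..., lengths typecode, offsets typecode, b"X"
        return raw[-3:-2].decode("ascii"), raw[-2:-1].decode("ascii")
    # Old format: the single last byte is the lengths typecode
    return raw[-1:].decode("ascii"), None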
Example #8
    class Writer(ColumnWriter):
        def __init__(self, dbfile, allow_offsets=True, cutoff=2**15):
            assert isinstance(dbfile, StructFile)
            self._dbfile = dbfile
            self._count = 0
            self._lengths = GrowableArray(allow_longs=False)
            self._offsets = GrowableArray(allow_longs=False)
            self._offset_base = 0
            self.allow_offsets = allow_offsets
            self.cutoff = cutoff

        def __repr__(self):
            return "<VarBytes.Writer>"

        def fill(self, docnum):
            base = self._offset_base
            if docnum > self._count:
                self._lengths.extend(0 for _ in xrange(docnum - self._count))
                self._offsets.extend(base for _ in xrange(docnum - self._count))

        def add(self, docnum, v):
            self.fill(docnum)
            self._dbfile.write(v)
            self._lengths.append(len(v))
            self._offsets.append(self._offset_base)
            self._offset_base += len(v)
            self._count = docnum + 1

        def finish(self, doccount):
            dbfile = self._dbfile
            # Pad out to the final count before snapshotting the arrays:
            # appending can rebind a GrowableArray's underlying array when
            # its typecode has to grow
            self.fill(doccount)
            lengths = self._lengths.array
            offsets = self._offsets.array

            dbfile.write_array(lengths)

            # Only write the offsets if the column holds a large number of
            # items; otherwise it's fast enough to derive them from the lengths
            write_offsets = self.allow_offsets and doccount > self.cutoff
            if write_offsets:
                dbfile.write_array(offsets)

            # Backwards compatibility: previous versions only wrote the lengths,
            # and the last byte of the column was the lengths type code...
            dbfile.write(lengths.typecode.encode("ascii"))
            # ...but if we wrote offsets, make the last byte "X" so we know
            if write_offsets:
                dbfile.write(offsets.typecode.encode("ascii"))
                dbfile.write("X".encode("ascii"))
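This variant addresses documents by absolute docnum, where the previous one took (docnum, docbase) pairs and stored everything relative to a per-block base; the writing logic is otherwise identical. The offsets array itself is redundant: offsets[i] is the running sum of the lengths before i, which is why it is only worth materializing past the cutoff of 2**15 = 32,768 items. A quick stdlib illustration of that derivation:

from array import array
from itertools import accumulate

lengths = array("B", [5, 0, 0, 5, 3])
# Prepend 0 and drop the final total to get each value's start offset
offsets = list(accumulate([0] + list(lengths)))[:-1]
assert offsets == [0, 5, 5, 5, 10]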