class Writer(ColumnWriter):
    """Column writer that stores raw values followed by a table of lengths.

    Each value is written to the file as soon as it is added. ``finish``
    then appends the array of per-document lengths and a single trailing
    byte holding that array's typecode.
    """

    def __init__(self, dbfile):
        """Set up an empty writer on top of ``dbfile``.

        :param dbfile: the output file; asserted to be a ``StructFile``.
        """
        assert isinstance(dbfile, StructFile)
        self._lengths = GrowableArray(allow_longs=False)
        self._count = 0
        self._dbfile = dbfile

    def __repr__(self):
        return "<VarBytes.Writer>"

    def fill(self, docnum):
        """Record zero lengths for the documents between the count and
        ``docnum``.

        Does nothing when ``docnum`` is not greater than the current count.
        This method does not change ``self._count`` itself; ``add`` does.
        """
        if docnum <= self._count:
            return
        missing = docnum - self._count
        self._lengths.extend(0 for _ in xrange(missing))

    def add(self, docnum, v):
        """Write the value ``v`` for document ``docnum``.

        Skipped documents are padded with zero lengths first, then the value
        is written and its length recorded.
        """
        self.fill(docnum)
        out = self._dbfile
        out.write(v)
        self._lengths.append(len(v))
        # The next expected document is the one right after this one
        self._count = docnum + 1

    def finish(self, doccount):
        """Pad up to ``doccount`` documents and write the lengths table."""
        self.fill(doccount)
        out = self._dbfile
        length_array = self._lengths.array
        out.write_array(length_array)
        # The final byte of the column is the typecode of the lengths array
        out.write_byte(ord(length_array.typecode))
class Writer(ColumnWriter):
    """Column writer that stores raw values followed by length/offset tables.

    Each value is written to the file as soon as it is added. ``finish``
    then appends the array of per-document lengths, optionally the array of
    per-document offsets, and trailing typecode byte(s) describing them.

    Documents are identified by a ``(docnum, docbase)`` pair; the position
    inside this column is ``docnum - docbase``.
    """

    def __init__(self, dbfile, allow_offsets=True, cutoff=2**15):
        """Set up an empty writer on top of ``dbfile``.

        :param dbfile: the output file; asserted to be a ``StructFile``.
        :param allow_offsets: if false, ``finish`` never writes the offsets
            array.
        :param cutoff: the offsets array is only written when the relative
            document number passed to ``finish`` is greater than this.
        """
        assert isinstance(dbfile, StructFile)
        self._dbfile = dbfile
        self._count = 0
        self._lengths = GrowableArray(allow_longs=False)
        self._offsets = GrowableArray(allow_longs=False)
        # Running total of the lengths of all values written so far, i.e.
        # the offset at which the next value will start
        self._offset_base = 0
        self.allow_offsets = allow_offsets
        self.cutoff = cutoff

    def __repr__(self):
        return "<VarBytes.Writer>"

    def fill(self, docinfo):
        """Pad the tables for documents between the count and ``docinfo``.

        Every skipped document gets a length of zero and the current offset
        base as its offset. Does nothing when the relative document number
        is not greater than the current count. This method does not change
        ``self._count`` itself; ``add`` does.

        :param docinfo: a ``(docnum, docbase)`` pair.
        """
        docnum, docbase = docinfo
        missing = (docnum - docbase) - self._count
        if missing > 0:
            base = self._offset_base
            self._lengths.extend(0 for _ in xrange(missing))
            self._offsets.extend(base for _ in xrange(missing))

    def add(self, docinfo, v):
        """Write the value ``v`` for the document described by ``docinfo``.

        :param docinfo: a ``(docnum, docbase)`` pair.
        :param v: the value to write; its ``len()`` is recorded as the
            document's length.
        """
        docnum, docbase = docinfo
        self.fill(docinfo)
        self._dbfile.write(v)
        self._lengths.append(len(v))
        # This value starts where the previous one ended
        self._offsets.append(self._offset_base)
        self._offset_base += len(v)
        self._count = (docnum - docbase) + 1

    def finish(self, docinfo):
        """Pad up to ``docinfo`` and write the length/offset tables.

        :param docinfo: a ``(docnum, docbase)`` pair whose difference is the
            number of documents in the column.
        """
        docnum, docbase = docinfo
        dbfile = self._dbfile

        # Pad BEFORE reading the ``array`` attributes, so the arrays written
        # below are the ones that contain the padding entries.
        # NOTE(review): this matters if GrowableArray replaces its ``array``
        # attribute while growing (e.g. to widen the typecode) -- confirm
        # against GrowableArray. The padding offset can be larger than any
        # offset stored so far.
        self.fill(docinfo)
        lengths = self._lengths.array
        offsets = self._offsets.array

        dbfile.write_array(lengths)

        # Only write the offsets if there is a large number of items in the
        # column, otherwise it's fast enough to derive them from the lens
        write_offsets = (self.allow_offsets
                         and docnum - docbase > self.cutoff)
        if write_offsets:
            dbfile.write_array(offsets)

        # Backwards compatibility: previous versions only wrote the lengths,
        # and the last byte of the column was the lengths type code...
        dbfile.write(lengths.typecode.encode("ascii"))
        # ...but if we wrote offsets, make the last byte "X" so we know
        if write_offsets:
            dbfile.write(offsets.typecode.encode("ascii"))
            dbfile.write("X".encode("ascii"))
class Writer(ColumnWriter):
    """Column writer that stores raw values followed by length/offset tables.

    Each value is written to the file as soon as it is added. ``finish``
    then appends the array of per-document lengths, optionally the array of
    per-document offsets, and trailing typecode byte(s) describing them.
    """

    def __init__(self, dbfile, allow_offsets=True, cutoff=2**15):
        """Set up an empty writer on top of ``dbfile``.

        :param dbfile: the output file; asserted to be a ``StructFile``.
        :param allow_offsets: if false, ``finish`` never writes the offsets
            array.
        :param cutoff: the offsets array is only written when the document
            count passed to ``finish`` is greater than this.
        """
        assert isinstance(dbfile, StructFile)
        self._dbfile = dbfile
        self._count = 0
        self._lengths = GrowableArray(allow_longs=False)
        self._offsets = GrowableArray(allow_longs=False)
        # Running total of the lengths of all values written so far, i.e.
        # the offset at which the next value will start
        self._offset_base = 0
        self.allow_offsets = allow_offsets
        self.cutoff = cutoff

    def __repr__(self):
        return "<VarBytes.Writer>"

    def fill(self, docnum):
        """Pad the tables for documents between the count and ``docnum``.

        Every skipped document gets a length of zero and the current offset
        base as its offset. Does nothing when ``docnum`` is not greater than
        the current count. This method does not change ``self._count``
        itself; ``add`` does.
        """
        missing = docnum - self._count
        if missing > 0:
            base = self._offset_base
            self._lengths.extend(0 for _ in xrange(missing))
            self._offsets.extend(base for _ in xrange(missing))

    def add(self, docnum, v):
        """Write the value ``v`` for document ``docnum``.

        :param docnum: the document number the value belongs to.
        :param v: the value to write; its ``len()`` is recorded as the
            document's length.
        """
        self.fill(docnum)
        self._dbfile.write(v)
        self._lengths.append(len(v))
        # This value starts where the previous one ended
        self._offsets.append(self._offset_base)
        self._offset_base += len(v)
        self._count = docnum + 1

    def finish(self, doccount):
        """Pad up to ``doccount`` documents and write the length/offset
        tables.

        :param doccount: the total number of documents in the column.
        """
        dbfile = self._dbfile

        # Pad BEFORE reading the ``array`` attributes, so the arrays written
        # below are the ones that contain the padding entries.
        # NOTE(review): this matters if GrowableArray replaces its ``array``
        # attribute while growing (e.g. to widen the typecode) -- confirm
        # against GrowableArray. The padding offset can be larger than any
        # offset stored so far.
        self.fill(doccount)
        lengths = self._lengths.array
        offsets = self._offsets.array

        dbfile.write_array(lengths)

        # Only write the offsets if there is a large number of items in the
        # column, otherwise it's fast enough to derive them from the lens
        write_offsets = self.allow_offsets and doccount > self.cutoff
        if write_offsets:
            dbfile.write_array(offsets)

        # Backwards compatibility: previous versions only wrote the lengths,
        # and the last byte of the column was the lengths type code...
        dbfile.write(lengths.typecode.encode("ascii"))
        # ...but if we wrote offsets, make the last byte "X" so we know
        if write_offsets:
            dbfile.write(offsets.typecode.encode("ascii"))
            dbfile.write("X".encode("ascii"))