def read(cls, storage, indexname, gen=None, schema=None):
    """Load a table of contents (TOC) for ``indexname`` from ``storage``.

    :param storage: storage object; must provide ``open_file``.
    :param indexname: name of the index, used to build the TOC filename.
    :param gen: generation number to read. When ``None``, the latest
        generation is looked up with ``cls._latest_generation``.
    :param schema: passed through unchanged to the TOC loader function.
    :returns: ``cls(schema, segments, gen, version=..., release=...)`` built
        from the values the loader read.
    :raises EmptyIndexError: if ``gen`` is ``None`` and no generation exists
        (the latest generation is negative).
    :raises IndexVersionError: if the TOC version is not the current one and
        no loader is registered for it in ``toc_loaders``.
    :raises Exception: if the checksum stored in the file does not match the
        checksum computed while reading.
    """
    if gen is None:
        gen = cls._latest_generation(storage, indexname)
        if gen < 0:
            raise EmptyIndexError("Index %r does not exist in %r"
                                  % (indexname, storage))

    tocfilename = cls._filename(indexname, gen)
    stream = ChecksumFile(storage.open_file(tocfilename))
    # Everything below can raise (bad preamble, unknown version, unpickling
    # problems, checksum mismatch), so make sure the file is always closed.
    try:
        # Do general sanity checks at the beginning and read the version
        # numbers
        toc_version, release = cls._read_preamble(stream)

        if toc_version == _CURRENT_TOC_VERSION:
            loader = cls._read_info
        elif toc_version in toc_loaders:
            # There's a backwards-compatible loader function for this
            # version; use it to load the rest of the TOC
            loader = toc_loaders[toc_version]
        else:
            raise IndexVersionError("Can't read format %s" % toc_version,
                                    toc_version)

        schema, segments = loader(stream, gen, schema, toc_version)

        # Take the running checksum before reading the stored one
        # (presumably so the stored value's own bytes are not folded in).
        file_check = stream.checksum()
        orig_check = stream.read_uint()
        if file_check != orig_check:
            raise Exception("TOC checksum does not match %d != %d"
                            % (file_check, orig_check))
    finally:
        stream.close()

    return cls(schema, segments, gen, version=toc_version, release=release)
def write(self, storage, indexname):
    """Serialize this table of contents into ``storage``.

    The data is written to a temporary file which is then renamed to the
    final TOC filename, so readers never see a half-written TOC. If writing
    fails, the stream is closed, the rename is skipped and the exception
    propagates; the partially written temporary file is not removed here.

    :param storage: storage object; must provide ``create_file`` and
        ``rename_file``.
    :param indexname: name of the index, used to build the TOC filename.
    :raises pickle.PicklingError: if the schema cannot be pickled. When a
        single offending field can be identified, its name and repr are
        included in the message.
    """
    schema = ensure_schema(self.schema)
    schema.clean()

    # Use a temporary file for atomic write
    tocfilename = self._filename(indexname, self.generation)
    tempfilename = '%s.%s' % (tocfilename, time())
    stream = ChecksumFile(storage.create_file(tempfilename))
    try:
        # Write the sanity checks and version numbers
        self._write_preamble(stream)

        # Write pickles as strings to allow them to be skipped
        try:
            stream.write_string(pickle.dumps(schema, -1))
        except pickle.PicklingError:
            # Try to narrow down the error to a single field. Use the same
            # pickle protocol as above so the same failure is reproduced.
            for fieldname, field in schema.items():
                try:
                    pickle.dumps(field, -1)
                except pickle.PicklingError:
                    e = sys.exc_info()[1]
                    raise pickle.PicklingError("%s %s=%r"
                                               % (e, fieldname, field))
            # Otherwise, re-raise the original exception
            raise

        # Write the list of segments
        stream.write_varint(len(self.segments))
        for segment in self.segments:
            # Write the segment's module and class name before the pickle to
            # possibly allow later versions to load the segment differently
            # based on the class (for backwards compatibility)
            segtype = segment.__class__
            typename = "%s.%s" % (segtype.__module__, segtype.__name__)
            stream.write_string(typename.encode("latin1"))
            stream.write_string(pickle.dumps(segment, -1))

        # The checksum of everything written so far goes last in the file
        stream.write_uint(stream.checksum())
    finally:
        # Always release the file handle, even when writing failed
        stream.close()

    # Only reached when the whole TOC was written successfully
    storage.rename_file(tempfilename, tocfilename, safe=True)
def test_checksum_file():
    """Check that ChecksumFile reports the CRC32 of the data passing through.

    The same sample data is written once through a plain file (to compute
    the reference CRC32 with zlib) and once through a ChecksumFile wrapper;
    the wrapper's checksum must equal the reference both after writing and
    after reading the data back.
    """
    from whoosh.filedb.structfile import ChecksumFile
    from zlib import crc32

    def write_sample(out):
        # A mix of raw bytes and structured values
        out.write(b("Testing"))
        out.write_int(-100)
        out.write_varint(10395)
        out.write_string(b("Hello"))
        out.write_ushort(32959)

    storage = RamStorage()

    # Reference copy, written without any checksumming
    plain = storage.create_file("control")
    write_sample(plain)
    plain.close()

    # Compute the expected CRC32 from the reference copy's raw contents
    plain = storage.open_file("control")
    contents = plain.read()
    plain.close()
    expected_crc = crc32(contents) & 0xffffffff

    # Writing through the wrapper must yield the same checksum
    raw = storage.create_file("test")
    summed = ChecksumFile(raw)
    write_sample(summed)
    assert summed.checksum() == expected_crc
    raw.close()

    # Reading back through the wrapper must return the same values and
    # arrive at the same checksum
    raw = storage.open_file("test")
    summed = ChecksumFile(raw)
    assert summed.read(7) == b("Testing")
    assert summed.read_int() == -100
    assert summed.read_varint() == 10395
    assert summed.read_string() == b("Hello")
    assert summed.read_ushort() == 32959
    assert summed.checksum() == expected_crc
    summed.close()