def test_pickle_schema():
    from whoosh import analysis
    from whoosh.support.charset import accent_map
    from whoosh.compat import dumps

    freetext_analyzer = (analysis.StemmingAnalyzer()
                         | analysis.CharsetFilter(accent_map))

    schema = fields.Schema(
        path=fields.ID(stored=True, unique=True),
        file_mtime=fields.DATETIME(stored=True),
        name=fields.TEXT(stored=False, field_boost=2.0),
        description=fields.TEXT(stored=False, field_boost=1.5,
                                analyzer=freetext_analyzer),
        content=fields.TEXT(analyzer=freetext_analyzer)
    )

    # Try to make some sentences that will require stemming
    docs = [
        u"The rain in spain falls mainly in the plain",
        u"Plainly sitting on the plain",
        u"Imagine a greatly improved sentence here"
    ]

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for doc in docs:
                w.add_document(description=doc, content=doc)

        assert dumps(schema, 2)

        with ix.reader() as r:
            assert dumps(r.schema, 2)
def test_charset_pickeability():
    from whoosh.support import charset
    charmap = charset.charset_table_to_dict(charset.default_charset)
    ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap)
    _ = dumps(ana, -1)

    ana = analysis.CharsetTokenizer(charmap)
    _ = dumps(ana, -1)
def _write_block(self, last=False):
    # Write the buffered block to the postings file

    # If this is the first block, write a small header first
    if not self._blockcount:
        self._postfile.write(WHOOSH3_HEADER_MAGIC)

    # Add this block's statistics to the terminfo object, which tracks the
    # overall statistics for all term postings
    self._terminfo.add_block(self)

    # Minify the IDs, weights, and values, and put them in a tuple
    data = (self._mini_ids(), self._mini_weights(), self._mini_values())
    # Pickle the tuple
    databytes = dumps(data)
    # If the pickle is less than 20 bytes, don't bother compressing
    if len(databytes) < 20:
        comp = 0
    else:
        # Compress the pickle (if self._compression > 0)
        comp = self._compression
        if comp:
            databytes = zlib.compress(databytes, comp)

    # Make a tuple of block info. The posting reader can check this info
    # and decide whether to skip the block without having to decompress the
    # full block data
    #
    # - Number of postings in block
    # - Last ID in block
    # - Maximum weight in block
    # - Compression level
    # - Minimum length byte
    # - Maximum length byte
    ids = self._ids
    infobytes = dumps((len(ids), ids[-1], self._maxweight, comp,
                       length_to_byte(self._minlength),
                       length_to_byte(self._maxlength),
                       ))

    # Write block length
    postfile = self._postfile
    blocklength = len(infobytes) + len(databytes)
    if last:
        # If this is the last block, use a negative number
        blocklength *= -1
    postfile.write_int(blocklength)
    # Write block info
    postfile.write(infobytes)
    # Write block data
    postfile.write(databytes)

    self._blockcount += 1
    # Reset block buffer
    self._new_block()
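# A minimal, hypothetical sketch (not the library's actual reader) of how the
# block layout produced by _write_block above could be consumed. It assumes the
# header magic has already been read, that write_int writes a big-endian signed
# 32-bit int, and that pickle.load leaves the file positioned just past the
# info tuple. read_block and target_id are illustrative names only.
import struct
import zlib
from pickle import load, loads


def read_block(postfile, target_id):
    # Negative block length marks the final block
    blocklength = struct.unpack("!i", postfile.read(4))[0]
    is_last = blocklength < 0
    start = postfile.tell()

    # The small info tuple pickled by _write_block
    count, maxid, maxweight, comp, minlen, maxlen = load(postfile)

    if target_id > maxid and not is_last:
        # Every ID in this block is smaller than the one we want, so skip
        # the posting data without decompressing it
        postfile.seek(start + abs(blocklength))
        return None

    # Otherwise read, decompress, and unpickle the minified postings
    databytes = postfile.read(start + abs(blocklength) - postfile.tell())
    if comp:
        databytes = zlib.decompress(databytes)
    ids, weights, values = loads(databytes)
    return ids, weights, values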
def encode(self, poslist):
    deltas = []
    base = 0
    for pos in poslist:
        deltas.append(pos - base)
        base = pos
    return pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
def test_pickleability():
    # Ignore base classes
    ignore = (columns.Column, columns.WrappedColumn, columns.ListColumn)
    # Required arguments
    init_args = {
        "ClampedNumericColumn": (columns.NumericColumn("B"),),
        "FixedBytesColumn": (5,),
        "FixedBytesListColumn": (5,),
        "NumericColumn": ("i",),
        "PickleColumn": (columns.VarBytesColumn(),),
        "StructColumn": ("=if", (0, 0.0)),
    }

    coltypes = [
        c for _, c in inspect.getmembers(columns, inspect.isclass)
        if issubclass(c, columns.Column) and c not in ignore
    ]

    for coltype in coltypes:
        args = init_args.get(coltype.__name__, ())
        try:
            inst = coltype(*args)
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (coltype, e))
        _ = loads(dumps(inst, -1))
def encode(self, positions):
    codes = []
    base = 0
    for pos in positions:
        codes.append(pos - base)
        base = pos
    return pack_uint(len(codes)) + dumps(codes, -1)[2:-1]
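# A hypothetical decoder for the delta-encoded position values produced by the
# encode methods above (illustrative only). It assumes pack_uint is a
# big-endian unsigned 32-bit int and that the pickle was written with
# protocol 2, so dumps(...)[2:-1] stripped the two-byte header and the
# trailing STOP opcode, which are restored here before unpickling.
from pickle import loads
from struct import unpack


def decode_positions(value):
    count = unpack("!I", value[:4])[0]
    deltas = loads(b"\x80\x02" + value[4:] + b".")
    positions = []
    base = 0
    for delta in deltas:
        base += delta          # undo the running-delta encoding
        positions.append(base)
    assert len(positions) == count
    return positions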
def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    seen = defaultdict(list)

    kwargs["positions"] = True
    kwargs["chars"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

    for w, poses in iteritems(seen):
        # poses = [(pos, startchar, endchar, boost), ...]
        codes = []
        posbase = 0
        charbase = 0
        summedboost = 0
        for pos, startchar, endchar, boost in poses:
            codes.append((pos - posbase, startchar - charbase,
                          endchar - startchar, boost))
            posbase = pos
            charbase = endchar
            summedboost += boost
        value = (pack_uint(len(poses)) + pack_float(summedboost * fb)
                 + dumps(codes, -1)[2:-1])
        yield (w, len(poses), summedboost * fb, value)
def to_string(self):
    # Encode the lengths as 0-255 values
    ml = 0 if self._minlength is None else length_to_byte(self._minlength)
    xl = length_to_byte(self._maxlength)
    # Convert None values to the out-of-band NO_ID constant so they can be
    # stored as unsigned ints
    mid = NO_ID if self._minid is None else self._minid
    xid = NO_ID if self._maxid is None else self._maxid

    # Pack the term info into bytes
    st = self.struct.pack(self._weight, self._df, ml, xl, self._maxweight,
                          0, mid, xid)

    if isinstance(self.postings, tuple):
        # Postings are inlined - dump them using the pickle protocol
        isinlined = 1
        st += dumps(self.postings, -1)[2:-1]
    else:
        # Append postings pointer as long to end of term info bytes
        isinlined = 0
        # It's possible for a term info to not have a pointer to postings
        # on disk, in which case postings will be None. Convert a None
        # value to -1 so it can be stored as a long.
        p = -1 if self.postings is None else self.postings
        st += pack_long(p)

    # Prepend byte indicating whether the postings are inlined to the term
    # info bytes
    return pack("B", isinlined) + st
def encode(self, poslist):
    deltas = []
    posbase = 0
    charbase = 0
    for pos, startchar, endchar in poslist:
        deltas.append((pos - posbase, startchar - charbase,
                       endchar - startchar))
        posbase = pos
        charbase = endchar
    return pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
def encode(self, poses):
    codes = []
    base = 0
    summedboost = 0
    for pos, boost in poses:
        summedboost += boost
        codes.append((pos - base, boost))
        base = pos
    return (pack_uint(len(poses)) + pack_float(summedboost)
            + dumps(codes, -1)[2:-1])
def minimize_values(postingsize, values, compression=0):
    if postingsize < 0:
        string = dumps(values, -1)[2:]
    elif postingsize == 0:
        string = b('')
    else:
        string = b('').join(values)
    if string and compression:
        string = compress(string, compression)
    return string
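# A hypothetical inverse of minimize_values (illustrative only, not the
# library's reader): decompress if needed, then either unpickle the value list
# (variable-size postings), synthesize empty values, or split the byte string
# into fixed-size chunks. The count argument and restore_values name are
# assumptions for the sketch.
import zlib
from pickle import loads


def restore_values(postingsize, string, count, compression=0):
    if string and compression:
        string = zlib.decompress(string)
    if postingsize < 0:
        # dumps(values, -1)[2:] kept the STOP opcode, so only the two-byte
        # protocol-2 header needs to be put back before unpickling
        return list(loads(b"\x80\x02" + string))
    elif postingsize == 0:
        return [None] * count
    else:
        return [string[i:i + postingsize]
                for i in range(0, len(string), postingsize)]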
def encode(self, posns_chars):
    # posns_chars = [(pos, startchar, endchar), ...]
    codes = []
    posbase = 0
    charbase = 0
    for pos, startchar, endchar in posns_chars:
        codes.append((pos - posbase, startchar - charbase,
                      endchar - startchar))
        posbase = pos
        charbase = endchar
    return pack_uint(len(posns_chars)) + dumps(codes, -1)[2:-1]
def encode(self, posns_chars_boosts):
    # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
    codes = []
    posbase = 0
    charbase = 0
    summedboost = 0
    for pos, startchar, endchar, boost in posns_chars_boosts:
        codes.append((pos - posbase, startchar - charbase,
                      endchar - startchar, boost))
        posbase = pos
        charbase = endchar
        summedboost += boost
    return (pack_uint(len(posns_chars_boosts)) + pack_float(summedboost)
            + dumps(codes, -1)[2:-1])
def encode(self, poses):
    fb = self.field_boost
    # poses = [(pos, startchar, endchar, boost), ...]
    codes = []
    posbase = 0
    charbase = 0
    summedboost = 0
    for pos, startchar, endchar, boost in poses:
        codes.append((pos - posbase, startchar - charbase,
                      endchar - startchar, boost))
        posbase = pos
        charbase = endchar
        summedboost += boost
    return ((pack_uint(len(poses)) + pack_float(summedboost * fb)
             + dumps(codes, 2)), summedboost)
def append(self, values):
    f = self.dbfile
    name_map = self.name_map

    vlist = [None] * len(name_map)
    for k, v in iteritems(values):
        if k in name_map:
            vlist[name_map[k]] = v
        else:
            # For dynamic stored fields, put them at the end of the list
            # as a tuple of (fieldname, value)
            vlist.append((k, v))

    v = dumps(vlist, -1)[2:-1]
    self.length += 1
    self.directory.append(pack_stored_pointer(f.tell(), len(v)))
    f.write(v)
def add(self, vdict):
    f = self.dbfile
    names = self.names
    name_map = self.name_map

    vlist = [None] * len(names)
    for k, v in iteritems(vdict):
        if k in name_map:
            vlist[name_map[k]] = v
        else:
            name_map[k] = len(names)
            names.append(k)
            vlist.append(v)

    vstring = dumps(tuple(vlist), -1)[2:-1]
    self.length += 1
    self.directory.append(pack_stored_pointer(f.tell(), len(vstring)))
    f.write(vstring)
def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    poses = defaultdict(list)
    weights = defaultdict(float)

    kwargs["positions"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        poses[t.text].append(t.pos)
        weights[t.text] += t.boost

    for w, poslist in iteritems(poses):
        deltas = []
        base = 0
        for pos in poslist:
            deltas.append(pos - base)
            base = pos
        value = pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
        yield (w, len(poslist), weights[w] * fb, value)
def to_file(self, postfile, compression=3):
    ids = self.ids
    idcode, idstring = minimize_ids(ids, self.stringids, compression)
    wtstring = minimize_weights(self.weights, compression)
    vstring = minimize_values(self.postingsize, self.values, compression)

    info = (len(ids), ids[-1], self.maxweight,
            length_to_byte(self.minlength), length_to_byte(self.maxlength),
            idcode, compression, len(idstring), len(wtstring))
    infostring = dumps(info, -1)

    # Offset to next block
    postfile.write_uint(len(infostring) + len(idstring) + len(wtstring)
                        + len(vstring))
    # Block contents
    postfile.write(infostring)
    postfile.write(idstring)
    postfile.write(wtstring)
    postfile.write(vstring)
def minimize_ids(arry, stringids, compression=0):
    amax = arry[-1]

    if stringids:
        typecode = ''
        string = dumps(arry)
    else:
        # Default to the array's own typecode, then shrink it if the IDs fit
        typecode = code = arry.typecode
        if amax <= 255:
            typecode = "B"
        elif amax <= 65535:
            typecode = "H"

        if typecode != code:
            arry = array(typecode, iter(arry))
        if not IS_LITTLE:
            arry.byteswap()
        string = arry.tostring()

    if compression:
        string = compress(string, compression)
    return (typecode, string)
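# A hypothetical counterpart to minimize_ids (illustrative only): decompress if
# necessary, then either unpickle the ID list (string IDs, signalled by an
# empty typecode) or rebuild the integer array and byte-swap it back on
# big-endian machines. The restore_ids name is an assumption for this sketch.
import zlib
from array import array
from pickle import loads
from sys import byteorder

IS_LITTLE = byteorder == "little"


def restore_ids(typecode, string, compression=0):
    if compression:
        string = zlib.decompress(string)
    if not typecode:
        return loads(string)      # string IDs were pickled whole
    arry = array(typecode)
    arry.frombytes(string)        # round-trips arry.tostring()/tobytes()
    if not IS_LITTLE:
        arry.byteswap()
    return arry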
def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    seen = defaultdict(list)

    kwargs["positions"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        pos = t.pos
        boost = t.boost
        seen[t.text].append((pos, boost))

    for w, poses in iteritems(seen):
        codes = []
        base = 0
        summedboost = 0
        for pos, boost in poses:
            summedboost += boost
            codes.append((pos - base, boost))
            base = pos
        value = (pack_uint(len(poses)) + pack_float(summedboost)
                 + dumps(codes, -1)[2:-1])
        yield (w, len(poses), sum(p[1] for p in poses) * fb, value)
def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    seen = defaultdict(list)
    weights = defaultdict(float)

    kwargs["positions"] = True
    kwargs["chars"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        seen[t.text].append((t.pos, t.startchar, t.endchar))
        weights[t.text] += t.boost

    for w, poslist in iteritems(seen):
        deltas = []
        posbase = 0
        charbase = 0
        for pos, startchar, endchar in poslist:
            deltas.append((pos - posbase, startchar - charbase,
                           endchar - startchar))
            posbase = pos
            charbase = endchar
        value = pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
        yield (w, len(poslist), weights[w] * fb, value)
def to_bytes(self):
    isinlined = self.is_inlined()

    # Encode the lengths as 0-255 values
    minlength = (0 if self._minlength is None
                 else length_to_byte(self._minlength))
    maxlength = length_to_byte(self._maxlength)
    # Convert None values to the out-of-band NO_ID constant so they can be
    # stored as unsigned ints
    minid = 0xffffffff if self._minid is None else self._minid
    maxid = 0xffffffff if self._maxid is None else self._maxid

    # Pack the term info into bytes
    st = self._struct.pack(isinlined, self._weight, self._df, minlength,
                           maxlength, self._maxweight, minid, maxid)

    if isinlined:
        # Postings are inlined - dump them using the pickle protocol
        postbytes = dumps(self._inlined, -1)
    else:
        postbytes = pack_long(self._offset) + pack_int(self._length)
    st += postbytes
    return st
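# A hypothetical sketch of decoding the bytes produced by to_bytes (illustrative
# only; the real header layout lives in self._struct and is not shown here).
# The fixed-size header is unpacked first; the remaining bytes are either an
# inlined pickle of the postings or an (offset, length) pointer pair, assuming
# pack_long/pack_int are big-endian "!q"/"!i".
from pickle import loads
from struct import unpack


def terminfo_from_bytes(data, header_struct):
    header = header_struct.unpack(data[:header_struct.size])
    isinlined = header[0]
    rest = data[header_struct.size:]
    if isinlined:
        # The postings were pickled directly into the term info
        return header, loads(rest)
    # Otherwise the tail is a pointer to the postings on disk
    offset, length = unpack("!qi", rest)
    return header, (offset, length)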
def test_la_pickleability():
    ana = analysis.LanguageAnalyzer("en")
    _ = dumps(ana, -1)
def add(self, docnum, v):
    if v is None:
        v = emptybytes
    else:
        v = dumps(v, -1)
    self._child.add(docnum, v)
def add_field(self, fieldname, fieldobj, value, length):
    if value is not None:
        value = dumps(value, -1)
    self._print_line(2, "DOCFIELD", fn=fieldname, v=value, len=length)
def write(self, compression=3):
    postfile = self.postfile
    stringids = self.stringids
    ids = self.ids
    weights = self.weights
    values = self.values
    postcount = len(ids)

    if postcount <= 4 or not can_compress:
        compression = 0

    # Max ID
    maxid = ids[-1]
    if stringids:
        maxid_string = dumps(maxid, -1)[2:]
    else:
        maxid_string = pack_uint(maxid)

    # IDs
    typecode = "I"
    if stringids:
        ids_string = dumps(ids, -1)[2:]
        typecode = "s"
    else:
        if maxid <= 255:
            typecode = "B"
        elif maxid <= 65535:
            typecode = "H"
        if typecode != ids.typecode:
            ids = array(typecode, iter(ids))
        if not IS_LITTLE:
            ids.byteswap()
        ids_string = ids.tostring()
    if compression:
        ids_string = compress(ids_string, compression)

    # Weights
    if all(w == 1.0 for w in weights):
        weights_string = b('')
    else:
        if not IS_LITTLE:
            weights.byteswap()
        weights_string = weights.tostring()
    if weights_string and compression:
        weights_string = compress(weights_string, compression)

    # Values
    postingsize = self.postingsize
    if postingsize < 0:
        values_string = dumps(values, -1)[2:]
    elif postingsize == 0:
        values_string = b('')
    else:
        values_string = b("").join(values)
    if values_string and compression:
        values_string = compress(values_string, compression)

    # Header
    flags = 1 if compression else 0
    blocksize = sum((self._struct.size, len(maxid_string), len(ids_string),
                     len(weights_string), len(values_string)))
    header = self._struct.pack(blocksize, flags, postcount,
                               typecode.encode('latin-1'), 0,
                               len(ids_string), len(weights_string),
                               self.max_weight(), self.max_wol(), 0, 0,
                               self._maxlength, self._minlength or 0)
    postfile.write(header)

    postfile.write(maxid_string)
    postfile.write(ids_string)
    postfile.write(weights_string)
    postfile.write(values_string)
def add(self, docnum, v):
    if v is None:
        v = emptybytes
    else:
        v = dumps(v, 2)
    self._child.add(docnum, v)