@classmethod
def from_string(cls, s):
    hbyte = ord(s[0:1])
    if hbyte < 2:
        st = cls.struct
        # Freq, doc freq, min len, max len, max weight, max WOL,
        # min ID, max ID
        f, df, ml, xl, xw, xwol, mid, xid = st.unpack(s[1:st.size + 1])
        mid = None if mid == NO_ID else mid
        xid = None if xid == NO_ID else xid

        # Postings
        pstr = s[st.size + 1:]
        if hbyte == 0:
            p = unpack_long(pstr)[0]
        else:
            p = loads(pstr + b("."))
    else:
        # Old format was encoded as a variable-length pickled tuple
        v = loads(s + b("."))
        if len(v) == 1:
            f = df = 1
            p = v[0]
        elif len(v) == 2:
            f = df = v[1]
            p = v[0]
        else:
            f, p, df = v

        # Fake values for stats which weren't stored before
        ml = 1
        xl = 106374
        xw = 999999999
        xwol = 999999999
        mid = -1
        xid = -1

    return cls(f, df, ml, xl, xw, xwol, mid, xid, p)
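# A minimal standalone sketch (hypothetical names, not the original class) of
# the encoding that from_string() above reverses: a one-byte header selects
# how the postings are stored, a fixed struct carries the stats, and pickled
# tails are written without pickle's trailing STOP opcode (b"."), which is
# why loads() re-appends it when decoding.
import struct
from pickle import dumps, loads

_DEMO_STRUCT = struct.Struct("<qq")  # hypothetical: freq and doc freq only

def _demo_encode(f, df, postings):
    if isinstance(postings, int):
        # Header byte 0: postings is a single offset packed as a long
        hbyte, tail = 0, struct.pack("<q", postings)
    else:
        # Header byte 1: postings pickled, minus the trailing STOP byte
        hbyte, tail = 1, dumps(postings, 0)[:-1]
    return bytes([hbyte]) + _DEMO_STRUCT.pack(f, df) + tail

def _demo_decode(s):
    hbyte = s[0]
    f, df = _DEMO_STRUCT.unpack(s[1:_DEMO_STRUCT.size + 1])
    tail = s[_DEMO_STRUCT.size + 1:]
    p = struct.unpack("<q", tail)[0] if hbyte == 0 else loads(tail + b".")
    return f, df, p

assert _demo_decode(_demo_encode(3, 2, [(0, 1.0), (5, 2.0)])) == \
    (3, 2, [(0, 1.0), (5, 2.0)])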
@classmethod
def from_bytes(cls, s):
    st = cls._struct
    vals = st.unpack(s[:st.size])
    terminfo = cls()

    flags = vals[0]
    terminfo._weight = vals[1]
    terminfo._df = vals[2]
    terminfo._minlength = byte_to_length(vals[3])
    terminfo._maxlength = byte_to_length(vals[4])
    terminfo._maxweight = vals[5]
    terminfo._minid = None if vals[6] == 0xffffffff else vals[6]
    terminfo._maxid = None if vals[7] == 0xffffffff else vals[7]

    if flags:
        # Postings are stored inline
        terminfo._inlined = loads(s[st.size:])
    else:
        # Last bytes are pointer into posting file and length
        offpos = st.size
        lenpos = st.size + _LONG_SIZE
        terminfo._offset = unpack_long(s[offpos:lenpos])[0]
        terminfo._length = unpack_int(s[lenpos:lenpos + _INT_SIZE])

    return terminfo
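# A hypothetical sketch (not the library's actual struct) of the layout
# from_bytes() above parses: a fixed header whose first field is a flags
# value, followed either by inline pickled postings (flags set) or by a
# long offset and an int length pointing into a separate posting file.
import struct
from pickle import dumps, loads

_HDR = struct.Struct("<Bfi")  # hypothetical: flags, weight, doc freq

def _demo_pack(weight, df, inlined=None, offset=None, length=None):
    if inlined is not None:
        return _HDR.pack(1, weight, df) + dumps(inlined)
    return _HDR.pack(0, weight, df) + struct.pack("<qi", offset, length)

def _demo_unpack(s):
    flags, weight, df = _HDR.unpack(s[:_HDR.size])
    if flags:
        return weight, df, ("inline", loads(s[_HDR.size:]))
    offset, length = struct.unpack("<qi", s[_HDR.size:])
    return weight, df, ("pointer", offset, length)

assert _demo_unpack(_demo_pack(2.0, 2, offset=1024, length=88)) == \
    (2.0, 2, ("pointer", 1024, 88))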
def test_pickleability():
    # Ignore base classes
    ignore = (columns.Column, columns.WrappedColumn, columns.ListColumn)
    # Required arguments
    init_args = {
        "ClampedNumericColumn": (columns.NumericColumn("B"),),
        "FixedBytesColumn": (5,),
        "FixedBytesListColumn": (5,),
        "NumericColumn": ("i",),
        "PickleColumn": (columns.VarBytesColumn(),),
        "StructColumn": ("=if", (0, 0.0)),
    }

    coltypes = [
        c for _, c in inspect.getmembers(columns, inspect.isclass)
        if issubclass(c, columns.Column) and c not in ignore
    ]

    for coltype in coltypes:
        args = init_args.get(coltype.__name__, ())
        try:
            inst = coltype(*args)
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (coltype, e))
        _ = loads(dumps(inst, -1))
def __getitem__(self, num):
    if num > self.length - 1:
        raise IndexError("Tried to get document %s, file has %s"
                         % (num, self.length))

    dbfile = self.dbfile
    start = self.directory_offset + num * stored_pointer_size
    dbfile.seek(start)
    ptr = dbfile.read(stored_pointer_size)
    if len(ptr) != stored_pointer_size:
        raise Exception("Error reading %r @%s %s < %s"
                        % (dbfile, start, len(ptr), stored_pointer_size))
    position, length = unpack_stored_pointer(ptr)
    vlist = loads(dbfile.map[position:position + length] + b("."))

    names = self.names
    # Recreate a dictionary by putting the field names and values back
    # together by position. We can't just use dict(zip(...)) because we
    # want to filter out the None values.
    values = dict((names[i], vlist[i]) for i in xrange(len(names))
                  if vlist[i] is not None)

    # Pull any extra stored dynamic field values off the end of the list
    if len(vlist) > len(names):
        values.update(dict(vlist[len(names):]))

    return values
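# A standalone sketch (hypothetical field names) of the stored-field layout
# __getitem__() above reconstructs: values are pickled as a list aligned
# with the schema's field names, None entries are dropped on read, and any
# dynamic fields are appended to the end of the list as (name, value) pairs.
names = ["title", "path", "tags"]

def _demo_to_dict(vlist):
    values = dict((names[i], vlist[i]) for i in range(len(names))
                  if vlist[i] is not None)
    if len(vlist) > len(names):
        values.update(dict(vlist[len(names):]))
    return values

# "path" was not stored for this document; "size" is a dynamic field
assert _demo_to_dict(["Hello", None, "a,b", ("size", 10)]) == \
    {"title": "Hello", "tags": "a,b", "size": 10}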
def decode_positions(self, valuestring):
    codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + b("."))
    # Positions are delta-coded; accumulate the gaps to recover them
    position = 0
    posns = []
    for code in codes:
        position = code[0] + position
        posns.append(position)
    return posns
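# The positions are stored as deltas (each code holds the gap from the
# previous position), which keeps the pickled integers small. A minimal
# sketch of both directions; the one-element tuple mirrors code[0] above.
def _demo_encode_positions(posns):
    base = 0
    codes = []
    for pos in posns:
        codes.append((pos - base,))
        base = pos
    return codes

def _demo_decode_positions(codes):
    position = 0
    posns = []
    for code in codes:
        position = code[0] + position
        posns.append(position)
    return posns

assert _demo_decode_positions(_demo_encode_positions([7, 10, 42])) == [7, 10, 42]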
def _read_stored_fields(self):
    sfs = {}
    c = self._find_line(2, "DOCFIELD")
    while c is not None:
        v = c.get("v")
        if v is not None:
            v = loads(v)
        sfs[c["fn"]] = v
        c = self._find_line(2, "DOCFIELD")
    return sfs
def decode_positions(self, valuestring):
    # Tolerate a value string stored without pickle's trailing STOP byte
    if not valuestring.endswith(b(".")):
        valuestring += b(".")
    codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
    position = 0
    posns = []
    for code in codes:
        position = code[0] + position
        posns.append(position)
    return posns
@classmethod
def from_string(cls, s):
    assert isinstance(s, bytes_type)

    if isinstance(s, string_type):
        hbyte = ord(s[0])  # Python 2.x - str
    else:
        hbyte = s[0]  # Python 3 - bytes

    if hbyte < 2:
        st = cls.struct
        # Weight, doc freq, min len, max len, max weight, unused,
        # min ID, max ID
        w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1:st.size + 1])
        mid = None if mid == NO_ID else mid
        xid = None if xid == NO_ID else xid

        # Postings
        pstr = s[st.size + 1:]
        if hbyte == 0:
            p = unpack_long(pstr)[0]
        else:
            p = loads(pstr + b("."))
    else:
        # Old format was encoded as a variable-length pickled tuple
        v = loads(s + b("."))
        if len(v) == 1:
            w = df = 1
            p = v[0]
        elif len(v) == 2:
            w = df = v[1]
            p = v[0]
        else:
            w, p, df = v

        # Fake values for stats which weren't stored before
        ml = 1
        xl = 255
        xw = 999999999
        mid = -1
        xid = -1

    ml = byte_to_length(ml)
    xl = byte_to_length(xl)
    obj = cls(w, df, ml, xl, xw, mid, xid)
    obj.postings = p
    return obj
def decode_characters(self, valuestring):
    codes = loads(valuestring[_INT_SIZE:] + b("."))
    # Positions and character spans are delta-coded: each start char is
    # relative to the previous end char, each end char to its start char
    position = 0
    endchar = 0
    posns_chars = []
    for code in codes:
        position = code[0] + position
        startchar = code[1] + endchar
        endchar = code[2] + startchar
        posns_chars.append((position, startchar, endchar))
    return posns_chars
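# In decode_characters() above the deltas are chained: the position delta
# is relative to the previous position, the start char is relative to the
# previous *end* char, and the end char is relative to its own start char.
# A hypothetical encoder producing codes the loop above can decode:
def _demo_encode_characters(posns_chars):
    position = endchar = 0
    codes = []
    for pos, start, end in posns_chars:
        codes.append((pos - position, start - endchar, end - start))
        position, endchar = pos, end
    return codes

# Spans (5, 20, 25) and (9, 31, 36): gaps of 4 positions and 6 characters
assert _demo_encode_characters([(5, 20, 25), (9, 31, 36)]) == \
    [(5, 20, 5), (4, 6, 5)]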
def deminimize_ids(typecode, count, string, compression=0):
    if compression:
        string = decompress(string)
    if typecode == '':
        # An empty typecode means the IDs were pickled (e.g. string IDs)
        return loads(string)
    else:
        arry = array(typecode)
        arry.fromstring(string)
        if not IS_LITTLE:
            arry.byteswap()
        return arry
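# A sketch of the round trip deminimize_ids() reverses: numeric IDs are
# written as a typed array's raw little-endian bytes (byteswapped first on
# big-endian machines), while an empty typecode means the IDs were pickled.
# tobytes()/frombytes() are the Python 3 spellings of the Python 2
# tostring()/fromstring() calls used in the code above.
import sys
from array import array

IS_LITTLE = sys.byteorder == "little"

def _demo_minimize_ids(ids, typecode="I"):
    arry = array(typecode, ids)
    if not IS_LITTLE:
        arry.byteswap()
    return arry.tobytes()

def _demo_deminimize_ids(typecode, string):
    arry = array(typecode)
    arry.frombytes(string)
    if not IS_LITTLE:
        arry.byteswap()
    return arry

assert list(_demo_deminimize_ids("I", _demo_minimize_ids([1, 5, 9]))) == [1, 5, 9]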
def decode_character_boosts(self, valuestring):
    codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + b("."))
    position = 0
    endchar = 0
    posn_char_boosts = []
    for code in codes:
        position = position + code[0]
        startchar = endchar + code[1]
        endchar = startchar + code[2]
        posn_char_boosts.append((position, startchar, endchar, code[3]))
    return posn_char_boosts
def deminimize_values(postingsize, count, string, compression=0):
    if compression:
        string = decompress(string)

    if postingsize < 0:
        return loads(string)
    elif postingsize == 0:
        return [None] * count
    else:
        return [string[i:i + postingsize]
                for i in xrange(0, len(string), postingsize)]
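# deminimize_values() recovers fixed-size posting values by slicing the
# concatenated bytes every postingsize bytes; a quick illustration:
blob = b"aaabbbccc"
assert [blob[i:i + 3] for i in range(0, len(blob), 3)] == [b"aaa", b"bbb", b"ccc"]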
def _read_data(self):
    # Load block data tuple from disk
    datalen = self._nextoffset - self._dataoffset
    b = self._postfile.get(self._dataoffset, datalen)

    # Decompress the pickled data if necessary
    if self._compression:
        b = zlib.decompress(b)

    # Unpickle the data tuple and save it in an attribute
    self._data = loads(b)
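# The block _read_data() reads is just an optionally zlib-compressed pickle
# of a data tuple; a minimal round trip under that assumption:
import zlib
from pickle import dumps, loads

data = ((1, 2, 3), ("a", "b"))
blob = zlib.compress(dumps(data))
assert loads(zlib.decompress(blob)) == data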
def read_ids(self):
    dataoffset = self.dataoffset
    ids_string = self.postfile.map[dataoffset:dataoffset + self.idslen]
    if self.compression:
        ids_string = decompress(ids_string)

    if self.stringids:
        ids = loads(ids_string)
    else:
        ids = array(self.typecode)
        ids.fromstring(ids_string)
        if not IS_LITTLE:
            ids.byteswap()

    self.ids = ids
    return ids
def __iter__(self):
    dbfile = self.dbfile
    names = self.names
    lengths = array("I")

    # First pass: read each entry's length from the directory, skipping
    # over the long position pointer that precedes it
    dbfile.seek(self.directory_offset)
    for i in xrange(self.length):
        dbfile.seek(_LONG_SIZE, 1)
        lengths.append(dbfile.read_uint())

    # Second pass: read the pickled value lists in order
    dbfile.seek(self.basepos)
    for length in lengths:
        vlist = loads(dbfile.read(length) + b("."))
        vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                     if vlist[i] is not None)
        yield vdict
def read_values(self):
    postingsize = self.postingsize
    if postingsize == 0:
        values = [None] * self.postcount
    else:
        offset = self.dataoffset + self.idslen + self.weightslen
        values_string = self.postfile.map[offset:self.nextoffset]
        if self.compression:
            values_string = decompress(values_string)
        if postingsize < 0:
            values = loads(values_string)
        else:
            values = [values_string[i:i + postingsize]
                      for i in xrange(0, len(values_string), postingsize)]
    self.values = values
    return values
def __iter__(self):
    for v in self._child:
        if not v:
            yield None
        else:
            yield loads(v)
def __getitem__(self, docnum):
    v = self._child[docnum]
    if not v:
        return None
    else:
        return loads(v)