Example #1
    @classmethod
    def from_string(cls, s):
        hbyte = ord(s[0:1])
        if hbyte < 2:
            st = cls.struct
            # Freq, Doc freq, min len, max len, max w, max WOL, min ID, max ID
            f, df, ml, xl, xw, xwol, mid, xid = st.unpack(s[1:st.size + 1])
            mid = None if mid == NO_ID else mid
            xid = None if xid == NO_ID else xid
            # Postings
            pstr = s[st.size + 1:]
            if hbyte == 0:
                p = unpack_long(pstr)[0]
            else:
                p = loads(pstr + b("."))
        else:
            # Old format was encoded as a variable length pickled tuple
            v = loads(s + b("."))
            if len(v) == 1:
                f = df = 1
                p = v[0]
            elif len(v) == 2:
                f = df = v[1]
                p = v[0]
            else:
                f, p, df = v
            # Fake values for stats which weren't stored before
            ml = 1
            xl = 106374
            xw = 999999999
            xwol = 999999999
            mid = -1
            xid = -1

        return cls(f, df, ml, xl, xw, xwol, mid, xid, p)
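The branch on hbyte implies a simple on-disk layout: one header byte, a fixed-size struct of term statistics, then either a packed long (the postings pointer) or an inline pickle with its trailing b"." stripped. Below is a minimal sketch of the writer side this reader implies; the struct format, the NO_ID sentinel value, and the helper names are assumptions for illustration, not Whoosh's own.

import struct
from pickle import dumps

NO_ID = 0xffffffff                          # assumed sentinel for "no min/max doc id"
stats_struct = struct.Struct("!fIIIffII")   # assumed: freq, doc freq, min len, max len, max w, max WOL, min id, max id

def pack_terminfo(f, df, ml, xl, xw, xwol, mid, xid, postings):
    stats = stats_struct.pack(f, df, ml, xl, xw, xwol,
                              NO_ID if mid is None else mid,
                              NO_ID if xid is None else xid)
    if isinstance(postings, int):
        # Header byte 0: the postings value is a single offset packed as a long
        return b"\x00" + stats + struct.pack("!q", postings)
    # Header byte 1: postings are pickled inline, minus pickle's trailing b"."
    # (the reader above re-appends it before calling loads)
    return b"\x01" + stats + dumps(postings, -1)[:-1]

Feeding the result back through from_string would recover the same values, assuming the class's struct matches the assumed format.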
Example #2
    @classmethod
    def from_bytes(cls, s):
        st = cls._struct
        vals = st.unpack(s[:st.size])
        terminfo = cls()

        flags = vals[0]
        terminfo._weight = vals[1]
        terminfo._df = vals[2]
        terminfo._minlength = byte_to_length(vals[3])
        terminfo._maxlength = byte_to_length(vals[4])
        terminfo._maxweight = vals[5]
        terminfo._minid = None if vals[6] == 0xffffffff else vals[6]
        terminfo._maxid = None if vals[7] == 0xffffffff else vals[7]

        if flags:
            # Postings are stored inline
            terminfo._inlined = loads(s[st.size:])
        else:
            # Last bytes are pointer into posting file and length
            offpos = st.size
            lenpos = st.size + _LONG_SIZE
            terminfo._offset = unpack_long(s[offpos:lenpos])[0]
            terminfo._length = unpack_int(s[lenpos:lenpos + _INT_SIZE])

        return terminfo
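The flags value packed at the front decides whether the bytes after the fixed header are an inline pickle of the postings or an (offset, length) pointer into the postings file. A sketch of the serializer this implies follows; the struct format, the 0xffffffff sentinel handling, and the function name are assumptions for illustration.

import struct
from pickle import dumps

_terminfo_struct = struct.Struct("!BfIBBfII")  # assumed: flags, weight, df, min/max length bytes, max weight, min id, max id

def terminfo_to_bytes(weight, df, minlen_byte, maxlen_byte, maxweight,
                      minid, maxid, inlined=None, offset=None, length=None):
    flags = 1 if inlined is not None else 0
    head = _terminfo_struct.pack(flags, weight, df, minlen_byte, maxlen_byte,
                                 maxweight,
                                 0xffffffff if minid is None else minid,
                                 0xffffffff if maxid is None else maxid)
    if flags:
        # Small posting lists are pickled directly after the fixed header
        return head + dumps(inlined, -1)
    # Otherwise store an 8-byte offset and a 4-byte length pointing into the postings file
    return head + struct.pack("!q", offset) + struct.pack("!i", length)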
Example #3
def test_pickleability():
    # Ignore base classes
    ignore = (columns.Column, columns.WrappedColumn, columns.ListColumn)
    # Required arguments
    init_args = {
        "ClampedNumericColumn": (columns.NumericColumn("B"), ),
        "FixedBytesColumn": (5, ),
        "FixedBytesListColumn": (5, ),
        "NumericColumn": ("i", ),
        "PickleColumn": (columns.VarBytesColumn(), ),
        "StructColumn": ("=if", (0, 0.0)),
    }

    coltypes = [
        c for _, c in inspect.getmembers(columns, inspect.isclass)
        if issubclass(c, columns.Column) and c not in ignore
    ]

    for coltype in coltypes:
        args = init_args.get(coltype.__name__, ())
        try:
            inst = coltype(*args)
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (coltype, e))
        _ = loads(dumps(inst, -1))
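The test enumerates every concrete Column subclass, instantiates it with any required constructor arguments, and checks that loads(dumps(inst, -1)) succeeds, presumably because column objects are stored as part of the pickled schema. The same round-trip idiom in isolation (the FixedBytes class here is a toy stand-in, not a Whoosh type):

import pickle

class FixedBytes:
    """Toy stand-in for a column type that must survive pickling."""
    def __init__(self, size):
        self.size = size

inst = FixedBytes(5)
clone = pickle.loads(pickle.dumps(inst, -1))   # -1 selects the highest pickle protocol
assert isinstance(clone, FixedBytes) and clone.size == 5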
Example #4
    def __getitem__(self, num):
        if num > self.length - 1:
            raise IndexError("Tried to get document %s, file has %s"
                             % (num, self.length))

        dbfile = self.dbfile
        start = self.directory_offset + num * stored_pointer_size
        dbfile.seek(start)
        ptr = dbfile.read(stored_pointer_size)
        if len(ptr) != stored_pointer_size:
            raise Exception("Error reading %r @%s %s < %s"
                            % (dbfile, start, len(ptr), stored_pointer_size))
        position, length = unpack_stored_pointer(ptr)
        vlist = loads(dbfile.map[position:position + length] + b("."))

        names = self.names
        # Recreate a dictionary by putting the field names and values back
        # together by position. We can't just use dict(zip(...)) because we
        # want to filter out the None values.
        values = dict((names[i], vlist[i]) for i in xrange(len(names))
                      if vlist[i] is not None)

        # Pull any extra stored dynamic field values off the end of the list
        if len(vlist) > len(names):
            values.update(dict(vlist[len(names):]))

        return values
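__getitem__ resolves a document number through a fixed-width directory: entry N lives at directory_offset + N * stored_pointer_size and holds the position and length of that document's pickled value list in the mapped file. A condensed sketch of the same lookup against a plain buffer; the pointer format and the names are assumptions, and it skips the trailing-b"." trick the real reader uses.

import struct
from pickle import loads

_stored_pointer = struct.Struct("!qI")   # assumed: 8-byte position + 4-byte length

def read_stored(buf, directory_offset, names, num):
    start = directory_offset + num * _stored_pointer.size
    position, length = _stored_pointer.unpack(buf[start:start + _stored_pointer.size])
    vlist = loads(buf[position:position + length])
    # Drop None entries so absent fields don't show up in the result dict
    return {names[i]: vlist[i] for i in range(len(names)) if vlist[i] is not None}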
Example #5
    @classmethod
    def from_bytes(cls, s):
        st = cls._struct
        vals = st.unpack(s[:st.size])
        terminfo = cls()

        flags = vals[0]
        terminfo._weight = vals[1]
        terminfo._df = vals[2]
        terminfo._minlength = byte_to_length(vals[3])
        terminfo._maxlength = byte_to_length(vals[4])
        terminfo._maxweight = vals[5]
        terminfo._minid = None if vals[6] == 0xffffffff else vals[6]
        terminfo._maxid = None if vals[7] == 0xffffffff else vals[7]

        if flags:
            # Postings are stored inline
            terminfo._inlined = loads(s[st.size:])
        else:
            # Last bytes are pointer into posting file and length
            offpos = st.size
            lenpos = st.size + _LONG_SIZE
            terminfo._offset = unpack_long(s[offpos:lenpos])[0]
            terminfo._length = unpack_int(s[lenpos:lenpos + _INT_SIZE])

        return terminfo
Example #6
 def decode_positions(self, valuestring):
     codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + b("."))
     position = 0
     posns = []
     for code in codes:
         position = code[0] + position
         posns.append(position)
     return posns
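Positions are delta-encoded: each code carries the gap from the previous position, so decoding is a running sum. A worked example, equivalent to the loop above:

from itertools import accumulate

codes = [(1,), (3,), (2,), (6,)]               # gaps between successive positions
positions = list(accumulate(code[0] for code in codes))
assert positions == [1, 4, 6, 12]              # 1, 1+3, 4+2, 6+6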
Example #7
 def decode_positions(self, valuestring):
     codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + b("."))
     position = 0
     posns = []
     for code in codes:
         position = code[0] + position
         posns.append(position)
     return posns
Example #8
 def _read_stored_fields(self):
     sfs = {}
     c = self._find_line(2, "DOCFIELD")
     while c is not None:
         v = c.get("v")
         if v is not None:
             v = loads(v)
         sfs[c["fn"]] = v
         c = self._find_line(2, "DOCFIELD")
     return sfs
Example #9
 def decode_positions(self, valuestring):
     if not valuestring.endswith(b(".")):
         valuestring += b(".")
     codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
     position = 0
     posns = []
     for code in codes:
         position = code[0] + position
         posns.append(position)
     return posns
Example #10
    @classmethod
    def from_string(cls, s):
        assert isinstance(s, bytes_type)

        if isinstance(s, string_type):
            hbyte = ord(s[0])  # Python 2.x - str
        else:
            hbyte = s[0]  # Python 3 - bytes

        if hbyte < 2:
            st = cls.struct
            # Weight, Doc freq, min len, max len, max w, unused, min ID, max ID
            w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1:st.size + 1])
            mid = None if mid == NO_ID else mid
            xid = None if xid == NO_ID else xid
            # Postings
            pstr = s[st.size + 1:]
            if hbyte == 0:
                p = unpack_long(pstr)[0]
            else:
                p = loads(pstr + b("."))
        else:
            # Old format was encoded as a variable length pickled tuple
            v = loads(s + b("."))
            if len(v) == 1:
                w = df = 1
                p = v[0]
            elif len(v) == 2:
                w = df = v[1]
                p = v[0]
            else:
                w, p, df = v
            # Fake values for stats which weren't stored before
            ml = 1
            xl = 255
            xw = 999999999
            mid = -1
            xid = -1

        ml = byte_to_length(ml)
        xl = byte_to_length(xl)
        obj = cls(w, df, ml, xl, xw, mid, xid)
        obj.postings = p
        return obj
Example #11
 def _read_stored_fields(self):
     sfs = {}
     c = self._find_line(2, "DOCFIELD")
     while c is not None:
         v = c.get("v")
         if v is not None:
             v = loads(v)
         sfs[c["fn"]] = v
         c = self._find_line(2, "DOCFIELD")
     return sfs
Example #12
 def decode_positions(self, valuestring):
     if not valuestring.endswith(b(".")):
         valuestring += b(".")
     codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
     position = 0
     posns = []
     for code in codes:
         position = code[0] + position
         posns.append(position)
     return posns
Example #13
 def decode_characters(self, valuestring):
     codes = loads(valuestring[_INT_SIZE:] + b("."))
     position = 0
     endchar = 0
     posns_chars = []
     for code in codes:
         position = code[0] + position
         startchar = code[1] + endchar
         endchar = code[2] + startchar
         posns_chars.append((position, startchar, endchar))
     return posns_chars
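decode_characters chains two running values: the position gap works as in the positions decoder, while each start character is offset from the previous end character and each end character is the start plus the stored span length. A small worked example of the same arithmetic:

codes = [(1, 0, 3), (2, 1, 4)]     # (position gap, startchar gap from last endchar, span length)
position = endchar = 0
decoded = []
for pos_gap, start_gap, span in codes:
    position += pos_gap
    startchar = endchar + start_gap
    endchar = startchar + span
    decoded.append((position, startchar, endchar))
assert decoded == [(1, 0, 3), (3, 4, 8)]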
Example #14
def deminimize_ids(typecode, count, string, compression=0):
    if compression:
        string = decompress(string)
    if typecode == '':
        return loads(string)
    else:
        arry = array(typecode)
        arry.fromstring(string)
        if not IS_LITTLE:
            arry.byteswap()
        return arry
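An empty typecode marks string IDs, which are stored as a pickle; any other typecode means the IDs were written as a packed array, always little-endian on disk, hence the byteswap() on big-endian machines. A sketch of the matching encoder follows; the name minimize_ids and its exact behaviour are assumptions, and the sketch uses tobytes, the modern spelling of the fromstring/tostring pair used above.

import sys
from array import array
from pickle import dumps
from zlib import compress

IS_LITTLE = sys.byteorder == "little"

def minimize_ids(ids, stringids=False, compression=0):
    if stringids:
        typecode, string = "", dumps(list(ids), -1)   # arbitrary IDs: fall back to pickle
    else:
        arry = array("I", ids)
        if not IS_LITTLE:
            arry.byteswap()                           # files always store little-endian
        typecode, string = arry.typecode, arry.tobytes()
    if compression:
        string = compress(string, compression)
    return typecode, string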
Example #15
 def decode_character_boosts(self, valuestring):
     codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + b("."))
     position = 0
     endchar = 0
     posn_char_boosts = []
     for code in codes:
         position = position + code[0]
         startchar = endchar + code[1]
         endchar = startchar + code[2]
         posn_char_boosts.append((position, startchar, endchar, code[3]))
     return posn_char_boosts
Example #16
 def decode_character_boosts(self, valuestring):
     codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + b("."))
     position = 0
     endchar = 0
     posn_char_boosts = []
     for code in codes:
         position = position + code[0]
         startchar = endchar + code[1]
         endchar = startchar + code[2]
         posn_char_boosts.append((position, startchar, endchar, code[3]))
     return posn_char_boosts
Example #17
def deminimize_values(postingsize, count, string, compression=0):
    if compression:
        string = decompress(string)

    if postingsize < 0:
        return loads(string)
    elif postingsize == 0:
        return [None] * count
    else:
        return [string[i:i + postingsize] for i
                in xrange(0, len(string), postingsize)]
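Here postingsize encodes three storage strategies: negative means variable-size values pickled as one list, zero means no stored values at all, and a positive size means fixed-size values concatenated back to back, recovered by plain slicing. The fixed-size case in isolation:

string = b"aabbccdd"       # four 2-byte values stored with no delimiter
postingsize = 2
values = [string[i:i + postingsize] for i in range(0, len(string), postingsize)]
assert values == [b"aa", b"bb", b"cc", b"dd"]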
Example #18
 def decode_characters(self, valuestring):
     codes = loads(valuestring[_INT_SIZE:] + b("."))
     position = 0
     endchar = 0
     posns_chars = []
     for code in codes:
         position = code[0] + position
         startchar = code[1] + endchar
         endchar = code[2] + startchar
         posns_chars.append((position, startchar, endchar))
     return posns_chars
Example #19
    def _read_data(self):
        # Load block data tuple from disk

        datalen = self._nextoffset - self._dataoffset
        b = self._postfile.get(self._dataoffset, datalen)

        # Decompress the pickled data if necessary
        if self._compression:
            b = zlib.decompress(b)

        # Unpickle the data tuple and save it in an attribute
        self._data = loads(b)
Example #20
    def _read_data(self):
        # Load block data tuple from disk

        datalen = self._nextoffset - self._dataoffset
        b = self._postfile.get(self._dataoffset, datalen)

        # Decompress the pickled data if necessary
        if self._compression:
            b = zlib.decompress(b)

        # Unpickle the data tuple and save it in an attribute
        self._data = loads(b)
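The block data tuple is pickled, optionally deflated, and written between _dataoffset and _nextoffset, so _read_data only has to slice, inflate, and unpickle. A sketch of the writer side against a plain file object; the function name and the use of compression as a zlib level are assumptions.

import zlib
from pickle import dumps

def write_block_data(postfile, data, compression=0):
    b = dumps(data, -1)
    if compression:
        b = zlib.compress(b, compression)
    offset = postfile.tell()       # becomes the block's _dataoffset
    postfile.write(b)
    return offset, len(b)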
Example #21
    def read_ids(self):
        dataoffset = self.dataoffset
        ids_string = self.postfile.map[dataoffset:dataoffset + self.idslen]
        if self.compression:
            ids_string = decompress(ids_string)

        if self.stringids:
            ids = loads(ids_string)
        else:
            ids = array(self.typecode)
            ids.fromstring(ids_string)
            if not IS_LITTLE:
                ids.byteswap()

        self.ids = ids
        return ids
Example #22
    def __iter__(self):
        dbfile = self.dbfile
        names = self.names
        lengths = array("I")

        dbfile.seek(self.directory_offset)
        for i in xrange(self.length):
            dbfile.seek(_LONG_SIZE, 1)
            lengths.append(dbfile.read_uint())

        dbfile.seek(self.basepos)
        for length in lengths:
            vlist = loads(dbfile.read(length) + b("."))
            vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                         if vlist[i] is not None)
            yield vdict
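Iteration makes two passes: it first walks the directory collecting record lengths, using a relative seek(_LONG_SIZE, 1) to skip each entry's 8-byte position, then reads the records sequentially from basepos. The directory walk in isolation, over an in-memory buffer with an assumed entry layout of an 8-byte position followed by a 4-byte unsigned length:

import struct
from io import BytesIO

_LONG_SIZE = struct.calcsize("!q")
directory = BytesIO(struct.pack("!qI", 0, 11) + struct.pack("!qI", 11, 7))

lengths = []
for _ in range(2):
    directory.seek(_LONG_SIZE, 1)                      # whence=1: skip the position field
    lengths.append(struct.unpack("!I", directory.read(4))[0])
assert lengths == [11, 7]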
Example #23
    def __iter__(self):
        dbfile = self.dbfile
        names = self.names
        lengths = array("I")

        dbfile.seek(self.directory_offset)
        for i in xrange(self.length):
            dbfile.seek(_LONG_SIZE, 1)
            lengths.append(dbfile.read_uint())

        dbfile.seek(self.basepos)
        for length in lengths:
            vlist = loads(dbfile.read(length) + b("."))
            vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                         if vlist[i] is not None)
            yield vdict
Example #24
    def read_ids(self):
        dataoffset = self.dataoffset
        ids_string = self.postfile.map[dataoffset:dataoffset + self.idslen]
        if self.compression:
            ids_string = decompress(ids_string)

        if self.stringids:
            ids = loads(ids_string)
        else:
            ids = array(self.typecode)
            ids.fromstring(ids_string)
            if not IS_LITTLE:
                ids.byteswap()

        self.ids = ids
        return ids
Example #25
    def read_values(self):
        postingsize = self.postingsize
        if postingsize == 0:
            values = [None] * self.postcount
        else:
            offset = self.dataoffset + self.idslen + self.weightslen
            values_string = self.postfile.map[offset:self.nextoffset]
            if self.compression:
                values_string = decompress(values_string)
            if postingsize < 0:
                values = loads(values_string)
            else:
                values = [values_string[i:i + postingsize]
                          for i in xrange(0, len(values_string), postingsize)]

        self.values = values
        return values
Example #26
    def read_values(self):
        postingsize = self.postingsize
        if postingsize == 0:
            values = [None] * self.postcount
        else:
            offset = self.dataoffset + self.idslen + self.weightslen
            values_string = self.postfile.map[offset:self.nextoffset]
            if self.compression:
                values_string = decompress(values_string)
            if postingsize < 0:
                values = loads(values_string)
            else:
                values = [
                    values_string[i:i + postingsize]
                    for i in xrange(0, len(values_string), postingsize)
                ]

        self.values = values
        return values
Example #27
def test_pickleability():
    # Ignore base classes
    ignore = (columns.Column, columns.WrappedColumn, columns.ListColumn)
    # Required arguments
    init_args = {"ClampedNumericColumn": (columns.NumericColumn("B"),),
                 "FixedBytesColumn": (5,),
                 "FixedBytesListColumn": (5,),
                 "NumericColumn": ("i",),
                 "PickleColumn": (columns.VarBytesColumn(),),
                 "StructColumn": ("=if", (0, 0.0)),
                 }

    coltypes = [c for _, c in inspect.getmembers(columns, inspect.isclass)
                if issubclass(c, columns.Column) and c not in ignore]

    for coltype in coltypes:
        args = init_args.get(coltype.__name__, ())
        try:
            inst = coltype(*args)
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (coltype, e))
        _ = loads(dumps(inst, -1))
Example #28
 def __iter__(self):
     for v in self._child:
         if not v:
             yield None
         else:
             yield loads(v)
Example #29
 def __iter__(self):
     for v in self._child:
         if not v:
             yield None
         else:
             yield loads(v)
Example #30
 def __getitem__(self, docnum):
     v = self._child[docnum]
     if not v:
         return None
     else:
         return loads(v)
Example #31
 def __getitem__(self, docnum):
     v = self._child[docnum]
     if not v:
         return None
     else:
         return loads(v)
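These readers wrap a child reader of raw bytes and unpickle lazily on access, treating an empty value as a missing field. The same pattern as a self-contained class (a toy stand-in, not Whoosh's reader):

from pickle import dumps, loads

class LazyPickleReader:
    def __init__(self, child):
        self._child = child                    # sequence of pickled blobs; b"" means missing

    def __getitem__(self, docnum):
        v = self._child[docnum]
        return loads(v) if v else None

    def __iter__(self):
        for v in self._child:
            yield loads(v) if v else None

reader = LazyPickleReader([dumps({"a": 1}, -1), b"", dumps([1, 2], -1)])
assert reader[1] is None and list(reader) == [{"a": 1}, None, [1, 2]]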