Code example #1
File: whoosh3.py Project: cylleneus/cylleneus
    def to_bytes(self):
        isinlined = self.is_inlined()

        # Encode the lengths as 0-255 values
        minlength = (
            0 if self._minlength is None else length_to_byte(self._minlength)
        )
        maxlength = length_to_byte(self._maxlength)
        # Convert None values to the out-of-band NO_ID constant so they can be
        # stored as unsigned ints
        minid = 0xFFFFFFFF if self._minid is None else self._minid
        maxid = 0xFFFFFFFF if self._maxid is None else self._maxid

        # Pack the term info into bytes
        st = self._struct.pack(
            isinlined,
            self._weight,
            self._df,
            minlength,
            maxlength,
            self._maxweight,
            minid,
            maxid,
        )

        if isinlined:
            # Postings are inlined - dump them using the pickle protocol
            postbytes = dumps(self._inlined, 2)
        else:
            postbytes = pack_long(self._offset) + pack_int(self._length)
        st += postbytes
        return st
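The to_bytes method above maps None doc IDs to 0xFFFFFFFF so they can be packed as unsigned ints. Below is a minimal, self-contained sketch of that out-of-band sentinel round trip; the pack_ids/unpack_ids helpers and the "!II" struct format are hypothetical illustrations, not Whoosh's actual term-info layout.

import struct

# Hypothetical illustration of the out-of-band sentinel used in to_bytes():
# None doc IDs are replaced by the largest 32-bit unsigned value before
# packing, and mapped back to None after unpacking.
NO_ID = 0xFFFFFFFF

def pack_ids(minid, maxid):
    minid = NO_ID if minid is None else minid
    maxid = NO_ID if maxid is None else maxid
    return struct.pack("!II", minid, maxid)

def unpack_ids(data):
    minid, maxid = struct.unpack("!II", data)
    return (None if minid == NO_ID else minid,
            None if maxid == NO_ID else maxid)

assert unpack_ids(pack_ids(None, 42)) == (None, 42)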
Code example #2
File: whoosh3.py Project: baeggot/plateshare
    def _write_block(self, last=False):
        # Write the buffered block to the postings file

        # If this is the first block, write a small header first
        if not self._blockcount:
            self._postfile.write(WHOOSH3_HEADER_MAGIC)

        # Add this block's statistics to the terminfo object, which tracks the
        # overall statistics for all term postings
        self._terminfo.add_block(self)

        # Minify the IDs, weights, and values, and put them in a tuple
        data = (self._mini_ids(), self._mini_weights(), self._mini_values())
        # Pickle the tuple
        databytes = dumps(data)
        # If the pickle is less than 20 bytes, don't bother compressing
        if len(databytes) < 20:
            comp = 0
        # Otherwise, compress the pickle (if self._compression > 0)
        else:
            comp = self._compression
        if comp:
            databytes = zlib.compress(databytes, comp)

        # Make a tuple of block info. The posting reader can check this info
        # and decide whether to skip the block without having to decompress the
        # full block data
        #
        # - Number of postings in block
        # - Last ID in block
        # - Maximum weight in block
        # - Compression level
        # - Minimum length byte
        # - Maximum length byte
        ids = self._ids
        infobytes = dumps((
            len(ids),
            ids[-1],
            self._maxweight,
            comp,
            length_to_byte(self._minlength),
            length_to_byte(self._maxlength),
        ))

        # Write block length
        postfile = self._postfile
        blocklength = len(infobytes) + len(databytes)
        if last:
            # If this is the last block, use a negative number
            blocklength *= -1
        postfile.write_int(blocklength)
        # Write block info
        postfile.write(infobytes)
        # Write block data
        postfile.write(databytes)

        self._blockcount += 1
        # Reset block buffer
        self._new_block()
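For orientation, here is a hypothetical reader for the framing _write_block produces: a signed block length (negative marks the last block), the pickled block-info tuple, then the possibly compressed posting data. It is only a sketch; it assumes write_int wrote a 4-byte big-endian signed int (Whoosh's pack_int format), assumes the file is already positioned past the one-time header magic, and leaves the "minified" IDs/weights/values in their on-disk form rather than reproducing the codec's real decoding.

import io
import pickle
import struct
import zlib

def read_block(f):
    # Hypothetical reader for one block written by _write_block() above;
    # assumes f is positioned past the WHOOSH3_HEADER_MAGIC bytes.
    (blocklength,) = struct.unpack("!i", f.read(4))
    last = blocklength < 0              # negative length marks the last block
    raw = f.read(abs(blocklength))

    buf = io.BytesIO(raw)
    info = pickle.load(buf)             # the small block-info tuple
    databytes = raw[buf.tell():]        # the rest is the posting data

    postcount, maxid, maxweight, comp, minlen, maxlen = info
    if comp:
        databytes = zlib.decompress(databytes)
    ids, weights, values = pickle.loads(databytes)
    # Still in the writer's "minified" form; the real codec decodes these
    # further before use.
    return last, info, (ids, weights, values)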
Code example #3
    def _write_block(self, last=False):
        # Write the buffered block to the postings file

        # If this is the first block, write a small header first
        if not self._blockcount:
            self._postfile.write(WHOOSH3_HEADER_MAGIC)

        # Add this block's statistics to the terminfo object, which tracks the
        # overall statistics for all term postings
        self._terminfo.add_block(self)

        # Minify the IDs, weights, and values, and put them in a tuple
        data = (self._mini_ids(), self._mini_weights(), self._mini_values())
        # Pickle the tuple
        databytes = dumps(data)
        # If the pickle is less than 20 bytes, don't bother compressing
        if len(databytes) < 20:
            comp = 0
        # Otherwise, compress the pickle (if self._compression > 0)
        else:
            comp = self._compression
        if comp:
            databytes = zlib.compress(databytes, comp)

        # Make a tuple of block info. The posting reader can check this info
        # and decide whether to skip the block without having to decompress the
        # full block data
        #
        # - Number of postings in block
        # - Last ID in block
        # - Maximum weight in block
        # - Compression level
        # - Minimum length byte
        # - Maximum length byte
        ids = self._ids
        infobytes = dumps((len(ids), ids[-1], self._maxweight, comp,
                           length_to_byte(self._minlength),
                           length_to_byte(self._maxlength),
                           ))

        # Write block length
        postfile = self._postfile
        blocklength = len(infobytes) + len(databytes)
        if last:
            # If this is the last block, use a negative number
            blocklength *= -1
        postfile.write_int(blocklength)
        # Write block info
        postfile.write(infobytes)
        # Write block data
        postfile.write(databytes)

        self._blockcount += 1
        # Reset block buffer
        self._new_block()
Code example #4
File: whoosh3.py Project: baeggot/plateshare
    def add_field(self, fieldname, fieldobj, value, length):
        if value is not None:
            self._storedfields[fieldname] = value
        if length:
            # Add byte to length column
            lenfield = _lenfield(fieldname)
            lb = length_to_byte(length)
            self.add_column_value(lenfield, LENGTHS_COLUMN, lb)
            # Add length to total field length
            self._fieldlengths[fieldname] += length
Code example #5
    def add_field(self, fieldname, fieldobj, value, length):
        if value is not None:
            self._storedfields[fieldname] = value
        if length:
            # Add byte to length column
            lenfield = _lenfield(fieldname)
            lb = length_to_byte(length)
            self.add_column_value(lenfield, LENGTHS_COLUMN, lb)
            # Add length to total field length
            self._fieldlengths[fieldname] += length
Code example #6
    def to_bytes(self):
        isinlined = self.is_inlined()

        # Encode the lengths as 0-255 values
        minlength = (0 if self._minlength is None
                     else length_to_byte(self._minlength))
        maxlength = length_to_byte(self._maxlength)
        # Convert None values to the out-of-band NO_ID constant so they can be
        # stored as unsigned ints
        minid = 0xffffffff if self._minid is None else self._minid
        maxid = 0xffffffff if self._maxid is None else self._maxid

        # Pack the term info into bytes
        st = self._struct.pack(isinlined, self._weight, self._df,
                               minlength, maxlength, self._maxweight,
                               minid, maxid)

        if isinlined:
            # Postings are inlined - dump them using the pickle protocol
            postbytes = dumps(self._inlined, -1)
        else:
            postbytes = pack_long(self._offset) + pack_int(self._length)
        st += postbytes
        return st
Code example #7
File: test_indexing.py Project: pombredanne/whoosh
def test_many_lengths():
    domain = u("alfa bravo charlie delta echo").split()
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i, word in enumerate(domain):
        length = (i + 1) ** 6
        w.add_document(text=" ".join(word for _ in xrange(length)))
    w.commit()

    s = ix.searcher()
    for i, word in enumerate(domain):
        target = byte_to_length(length_to_byte((i + 1) ** 6))
        ti = s.term_info("text", word)
        assert ti.min_length() == target
        assert ti.max_length() == target
Code example #8
File: test_indexing.py Project: CuteCha/dssm-theano
def test_many_lengths():
    domain = u("alfa bravo charlie delta echo").split()
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i, word in enumerate(domain):
        length = (i + 1)**6
        w.add_document(text=" ".join(word for _ in xrange(length)))
    w.commit()

    s = ix.searcher()
    for i, word in enumerate(domain):
        target = byte_to_length(length_to_byte((i + 1)**6))
        ti = s.term_info("text", word)
        assert ti.min_length() == target
        assert ti.max_length() == target
Code example #9
File: test_indexing.py Project: pombredanne/whoosh
def test_lengths():
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        items = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            w.add_document(f2=u(" ").join(islice(cycle(items), length)))
        w.commit()

        with ix.reader() as dr:
            ls1 = [dr.doc_field_length(i, "f1")
                   for i in xrange(0, len(lengths))]
            assert ls1 == [0] * len(lengths)
            ls2 = [dr.doc_field_length(i, "f2")
                   for i in xrange(0, len(lengths))]
            assert ls2 == [byte_to_length(length_to_byte(l)) for l in lengths]
Code example #10
File: test_indexing.py Project: CuteCha/dssm-theano
def test_lengths():
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        items = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            w.add_document(f2=u(" ").join(islice(cycle(items), length)))
        w.commit()

        with ix.reader() as dr:
            ls1 = [
                dr.doc_field_length(i, "f1") for i in xrange(0, len(lengths))
            ]
            assert ls1 == [0] * len(lengths)
            ls2 = [
                dr.doc_field_length(i, "f2") for i in xrange(0, len(lengths))
            ]
            assert ls2 == [byte_to_length(length_to_byte(l)) for l in lengths]
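The two tests above compare each document's stored field length against byte_to_length(length_to_byte(...)) rather than the raw length, because field lengths are stored as a single byte and that encoding is lossy for larger values. A quick way to see the quantization, assuming the two functions are importable from whoosh.util.numeric as in Whoosh 2.x (the module path may differ in other versions):

from whoosh.util.numeric import byte_to_length, length_to_byte

for n in (2, 10, 45, 102, 420, 10000):
    b = length_to_byte(n)
    print(n, "->", b, "->", byte_to_length(b))

# Small lengths round-trip exactly (test_length_byte below checks 0-10),
# while larger lengths may come back only as approximations.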
Code example #11
File: test_misc.py Project: JunjieHu/dl
def test_length_byte():
    source = list(range(11))
    xform = [length_to_byte(n) for n in source]
    result = [byte_to_length(n) for n in xform]
    assert source == result
Code example #12
def _byten(n):
    return byte_to_length(length_to_byte(n))
Code example #13
File: test_quality.py Project: sangensong/whoosh-1
def _discreet(length):
    return byte_to_length(length_to_byte(length))
Code example #14
def test_length_byte():
    source = list(range(11))
    xform = [length_to_byte(n) for n in source]
    result = [byte_to_length(n) for n in xform]
    assert source == result
Code example #15
File: test_quality.py Project: JunjieHu/dl
def _discreet(length):
    return byte_to_length(length_to_byte(length))