Example #1
0
    def to_file(self, file, stringids=False):
        """Write this block header to *file*.

        :param file: a structured file object providing ``write()``,
            ``tell()``, ``write_string()`` and ``write_uint()``
            (presumably a Whoosh StructFile -- TODO confirm).
        :param stringids: if True, the maximum ID is written as a
            length-prefixed UTF-8 string (vector blocks) instead of an
            unsigned int.
        """
        # NOTE(review): flag bit 0 is always set here; what it signals
        # must be confirmed against the block reader.
        flags = 1

        # Record where this block starts; _pointer_pos marks the field
        # 4 bytes in (just past flags + unused bytes) where the pointer
        # to the next block lives.
        self._blockstart = file.tell()
        self._pointer_pos = self._blockstart + 4
        file.write(
            self._struct.pack(
                flags,
                0,
                0,  # unused B, H
                self.nextoffset,
                self.idslen,
                self.weightslen,
                self.postcount,
                self.maxweight,
                self.maxwol,
                0,
                length_to_byte(self.minlength)))

        # Write the maximum ID after the header. We have to do this
        # separately because it might be a string (in the case of a vector)
        if stringids:
            file.write_string(utf8encode(self.maxid)[0])
        else:
            file.write_uint(self.maxid)
Example #2
0
    def _write_block(self):
        """Flush the buffered postings of the current block to the post file.

        On-disk layout: max ID, placeholder next-block pointer, posting
        count, the IDs, optional per-value lengths, then the value
        strings; finally seeks back to fill in the real pointer.
        """
        posting_size = self.format.posting_size
        stringids = self.stringids
        pf = self.postfile
        ids = self.blockids
        values = self.blockvalues
        postcount = len(ids)

        # Write the last ID first (presumably the maximum, i.e. IDs are
        # buffered in ascending order -- TODO confirm) so a reader can
        # decide to skip the block without decoding it.
        if stringids:
            pf.write_string(utf8encode(ids[-1])[0])
        else:
            pf.write_uint(ids[-1])

        startoffset = pf.tell()
        # Place holder for pointer to next block
        pf.write_uint(0)

        # Write the number of postings in this block
        pf.write_byte(postcount)
        if stringids:
            for id in ids:
                pf.write_string(utf8encode(id)[0])
        else:
            pf.write_array(ids)

        # A negative posting size means variable-length values: emit an
        # array of lengths so the reader can split the joined string.
        if posting_size < 0:
            # Write array of value lengths
            lengths = array("I")
            for valuestring in values:
                lengths.append(len(valuestring))
            pf.write_array(lengths)

        # posting_size == 0 means this format stores no values at all
        if posting_size != 0:
            pf.write("".join(values))

        # Seek back and write the pointer to the next block
        pf.flush()
        nextoffset = pf.tell()
        pf.seek(startoffset)
        pf.write_uint(nextoffset)
        pf.seek(nextoffset)

        self.posttotal += postcount
        self._reset_block()
        self.blockcount += 1
Example #3
0
    def _write_block(self):
        """Flush the buffered postings of the current block to the post file.

        On-disk layout: max ID, placeholder next-block pointer, posting
        count, the IDs, optional per-value lengths, then the value
        strings; finally seeks back to fill in the real pointer.
        """
        posting_size = self.format.posting_size
        stringids = self.stringids
        pf = self.postfile
        ids = self.blockids
        values = self.blockvalues
        postcount = len(ids)

        # Write the last ID first (presumably the maximum, i.e. IDs are
        # buffered in ascending order -- TODO confirm) so a reader can
        # decide to skip the block without decoding it.
        if stringids:
            pf.write_string(utf8encode(ids[-1])[0])
        else:
            pf.write_uint(ids[-1])

        startoffset = pf.tell()
        # Place holder for pointer to next block
        pf.write_uint(0)

        # Write the number of postings in this block
        pf.write_byte(postcount)
        if stringids:
            for id in ids:
                pf.write_string(utf8encode(id)[0])
        else:
            pf.write_array(ids)

        # A negative posting size means variable-length values: emit an
        # array of lengths so the reader can split the joined string.
        if posting_size < 0:
            # Write array of value lengths
            lengths = array("I")
            for valuestring in values:
                lengths.append(len(valuestring))
            pf.write_array(lengths)

        # posting_size == 0 means this format stores no values at all
        if posting_size != 0:
            pf.write("".join(values))

        # Seek back and write the pointer to the next block
        pf.flush()
        nextoffset = pf.tell()
        pf.seek(startoffset)
        pf.write_uint(nextoffset)
        pf.seek(nextoffset)

        self.posttotal += postcount
        self._reset_block()
        self.blockcount += 1
Example #4
0
def encode_posting(fieldNum, text, doc, freq, datastring):
    """Encodes a posting as a string, for sorting."""

    # Field number and UTF-8 term text come first so postings sort by
    # field, then term; a NUL separates the term from the numeric part.
    parts = [pack_ushort(fieldNum),
             utf8encode(text)[0],
             chr(0),
             pack2ints(doc, freq),
             datastring]
    return "".join(parts)
def encode_posting(fieldNum, text, doc, freq, datastring):
    """Encodes a posting as a string, for sorting."""

    # Sort key prefix: packed field number + UTF-8 term + NUL separator.
    prefix = pack_ushort(fieldNum) + utf8encode(text)[0] + chr(0)
    # Payload: packed (doc, freq) pair followed by the raw value data.
    return prefix + pack2ints(doc, freq) + datastring
Example #6
0
    def keycoder(self, key):
        """Encode a (fieldname, text) term tuple into a sortable byte key,
        assigning the next free field number to any unseen field name.
        """
        fieldname, text = key

        try:
            fieldnum = self.fieldmap[fieldname]
        except KeyError:
            # First time this field name is seen: allocate a new number
            fieldnum = self.fieldcounter
            self.fieldmap[fieldname] = fieldnum
            self.fieldcounter += 1

        return pack_ushort(fieldnum) + utf8encode(text)[0]
Example #7
0
    def keycoder(self, key):
        """Encode a (fieldname, text) term tuple as a sortable byte key.

        Unknown field names are registered in self.fieldmap with the
        next available field number.
        """
        name, text = key
        fmap = self.fieldmap

        try:
            num = fmap[name]
        except KeyError:
            # Register the new field name under the next counter value
            num = self.fieldcounter
            fmap[name] = num
            self.fieldcounter += 1

        encoded_text = utf8encode(text)[0]
        return pack_ushort(num) + encoded_text
Example #8
0
    def _write_node(self, dbfile, node):
        """Recursively write *node* and its descendants to *dbfile* and
        return the file offset at which *node*'s record starts.

        Children are written first (depth-first) so their offsets are
        known before this node's pointer array is emitted; offsets of
        already-written shared subtrees are reused via self.offsets,
        keyed by id() of the node object.
        """
        keys = node._edges.keys()
        ptrs = array("I")
        for key in keys:
            sn = node._edges[key]
            if id(sn) in self.offsets:
                # Shared subtree: reuse the offset recorded earlier
                ptrs.append(self.offsets[id(sn)])
            else:
                ptr = self._write_node(dbfile, sn)
                self.offsets[id(sn)] = ptr
                ptrs.append(ptr)

        start = dbfile.tell()

        # The low bit indicates whether this node represents the end of a word
        flags = int(node.final)
        # The second lowest bit = whether this node has children
        flags |= bool(keys) << 1
        # The third lowest bit = whether all keys are single chars
        singles = all(len(k) == 1 for k in keys)
        flags |= singles << 2
        # The fourth lowest bit = whether all keys are one byte
        # (sbytes is only bound when singles is true; the write loop
        # below only reads it on the singles branch)
        if singles:
            sbytes = all(ord(key) <= 255 for key in keys)
            flags |= sbytes << 3
        dbfile.write_byte(flags)

        if keys:
            dbfile.write_varint(len(keys))
            dbfile.write_array(ptrs)
            if singles:
                # One byte or one ushort per key, depending on range
                for key in keys:
                    o = ord(key)
                    if sbytes:
                        dbfile.write_byte(o)
                    else:
                        dbfile.write_ushort(o)
            else:
                # Multi-character keys: length-prefixed UTF-8 strings
                for key in keys:
                    dbfile.write_string(utf8encode(key)[0])

        return start
Example #9
0
    def _write_node(self, dbfile, node):
        """Recursively write *node* and its descendants to *dbfile* and
        return the file offset at which *node*'s record starts.

        Children are written first (depth-first) so their offsets are
        known before this node's pointer array is emitted; offsets of
        already-written shared subtrees are reused via self.offsets,
        keyed by id() of the node object.
        """
        keys = node._edges.keys()
        ptrs = array("I")
        for key in keys:
            sn = node._edges[key]
            if id(sn) in self.offsets:
                # Shared subtree: reuse the offset recorded earlier
                ptrs.append(self.offsets[id(sn)])
            else:
                ptr = self._write_node(dbfile, sn)
                self.offsets[id(sn)] = ptr
                ptrs.append(ptr)

        start = dbfile.tell()

        # The low bit indicates whether this node represents the end of a word
        flags = int(node.final)
        # The second lowest bit = whether this node has children
        flags |= bool(keys) << 1
        # The third lowest bit = whether all keys are single chars
        singles = all(len(k) == 1 for k in keys)
        flags |= singles << 2
        # The fourth lowest bit = whether all keys are one byte
        # (sbytes is only bound when singles is true; the write loop
        # below only reads it on the singles branch)
        if singles:
            sbytes = all(ord(key) <= 255 for key in keys)
            flags |= sbytes << 3
        dbfile.write_byte(flags)

        if keys:
            dbfile.write_varint(len(keys))
            dbfile.write_array(ptrs)
            if singles:
                # One byte or one ushort per key, depending on range
                for key in keys:
                    o = ord(key)
                    if sbytes:
                        dbfile.write_byte(o)
                    else:
                        dbfile.write_ushort(o)
            else:
                # Multi-character keys: length-prefixed UTF-8 strings
                for key in keys:
                    dbfile.write_string(utf8encode(key)[0])

        return start
Example #10
0
 def to_file(self, file, stringids=False):
     """Write this block header to *file*.

     :param file: a structured file object providing ``write()``,
         ``tell()``, ``write_string()`` and ``write_uint()``
         (presumably a Whoosh StructFile -- TODO confirm).
     :param stringids: if True, the maximum ID is written as a
         length-prefixed UTF-8 string (vector blocks) instead of an
         unsigned int.
     """
     # NOTE(review): flag bit 0 is always set here; what it signals
     # must be confirmed against the block reader.
     flags = 1

     # Record where this block starts; _pointer_pos marks the field
     # 4 bytes in (just past flags + unused bytes) where the pointer
     # to the next block lives.
     self._blockstart = file.tell()
     self._pointer_pos = self._blockstart + 4
     file.write(self._struct.pack(flags,
                                  0, 0, # unused B, H
                                  self.nextoffset,
                                  self.idslen,
                                  self.weightslen,
                                  self.postcount,
                                  self.maxweight, self.maxwol, 0,
                                  length_to_byte(self.minlength)))

     # Write the maximum ID after the header. We have to do this
     # separately because it might be a string (in the case of a vector)
     if stringids:
         file.write_string(utf8encode(self.maxid)[0])
     else:
         file.write_uint(self.maxid)
Example #11
0
def to_labels(key):
    """Takes a string and returns a list of bytestrings, suitable for use as
    a key or path in an FSA/FST graph.
    """

    # Convert to tuples of bytestrings (must be tuples so they can be hashed)
    keytype = type(key)

    # I hate the Python 3 bytes object so friggin much
    if keytype is tuple or keytype is list:
        if not all(isinstance(e, bytes_type) for e in key):
            # BUG FIX: the %r placeholder was never applied to a value,
            # so the message literally contained "%r"
            raise TypeError("%r contains a non-bytestring" % (key,))
        if keytype is list:
            key = tuple(key)
    elif isinstance(key, bytes_type):
        # Slice (not index) so each element stays a 1-byte bytes object
        # on Python 3 instead of an int
        key = tuple(key[i:i + 1] for i in xrange(len(key)))
    elif isinstance(key, text_type):
        # Encode each character separately to UTF-8
        key = tuple(utf8encode(key[i:i + 1])[0] for i in xrange(len(key)))
    else:
        raise TypeError("Don't know how to convert %r" % key)
    return key
Example #12
0
def to_labels(key):
    """Takes a string and returns a list of bytestrings, suitable for use as
    a key or path in an FSA/FST graph.
    """

    # Convert to tuples of bytestrings (must be tuples so they can be hashed)
    keytype = type(key)

    # I hate the Python 3 bytes object so friggin much
    if keytype is tuple or keytype is list:
        if not all(isinstance(e, bytes_type) for e in key):
            # BUG FIX: the %r placeholder was never applied to a value,
            # so the message literally contained "%r"
            raise TypeError("%r contains a non-bytestring" % (key,))
        if keytype is list:
            key = tuple(key)
    elif isinstance(key, bytes_type):
        # Slice (not index) so each element stays a 1-byte bytes object
        # on Python 3 instead of an int
        key = tuple(key[i:i + 1] for i in xrange(len(key)))
    elif isinstance(key, text_type):
        # Encode each character separately to UTF-8
        key = tuple(utf8encode(key[i:i + 1])[0] for i in xrange(len(key)))
    else:
        raise TypeError("Don't know how to convert %r" % key)
    return key
Example #13
0
def encode_termkey(term):
    """Encode a (fieldnum, text) term tuple as a sortable byte string."""
    fieldnum, text = term
    encoded_text = utf8encode(text)[0]
    return packushort(fieldnum) + encoded_text
Example #14
0
 def keycoder(self, key):
     """Encode a (fieldname, text) key; unknown fields map to 65535."""
     fieldname, text = key
     # 65535 acts as the sentinel field number for unmapped names
     fieldnum = self.fieldmap.get(fieldname, 65535)
     encoded_text = utf8encode(text)[0]
     return pack_ushort(fieldnum) + encoded_text
Example #15
0
 def keycoder(self, key):
     """Turn a (fieldname, text) pair into its packed byte key form.

     Field names missing from self.fieldmap fall back to 65535.
     """
     name, text = key
     num = self.fieldmap.get(name, 65535)
     return pack_ushort(num) + utf8encode(text)[0]
Example #16
0
    def _write_block(self):
        """Flush the buffered block of postings to the post file.

        Writes a BlockInfo header followed by the IDs, weights, and
        values (each section optionally compressed), then seeks back to
        patch the pointer to the next block.
        """
        posting_size = self.format.posting_size
        stringids = self.stringids
        pf = self.postfile
        ids = self.blockids
        values = self.blockvalues
        weights = self.blockweights
        postcount = len(ids)
        # Only compress when there are more than 4 postings in the block
        compressed = self.compressed and postcount > 4
        compression = self.compression

        # Calculate block statistics
        maxid = ids[-1]
        maxweight = max(weights)
        maxwol = 0.0
        minlength = 0
        if self.blocklengths:
            minlength = min(self.blocklengths)
            maxwol = max(w / l for w, l in zip(weights, self.blocklengths))

        # Compress IDs if necessary (byteswap first so the stored bytes
        # do not depend on the host's endianness)
        if not stringids and compressed:
            if IS_LITTLE:
                ids.byteswap()
            compressed_ids = compress(ids.tostring(), compression)
            idslen = len(compressed_ids)
        else:
            idslen = 0

        # Compress weights if necessary.  weightslen == 1 is a marker
        # meaning "every weight is 1.0, nothing is stored on disk".
        if all(w == 1.0 for w in weights):
            weightslen = 1
        # BUG FIX: these were independent "if" statements, so the
        # all-ones marker above was immediately overwritten by the
        # compressed/uncompressed branches; they must be exclusive.
        elif compressed:
            if IS_LITTLE:
                weights.byteswap()
            compressed_weights = compress(weights.tostring(), compression)
            weightslen = len(compressed_weights)
        else:
            weightslen = 0

        # Write the blockinfo
        blockinfo = BlockInfo(nextoffset=0,
                              maxweight=maxweight,
                              maxwol=maxwol,
                              minlength=minlength,
                              postcount=postcount,
                              maxid=maxid,
                              idslen=idslen,
                              weightslen=weightslen)
        blockinfo.to_file(pf, stringids)

        # Write the IDs
        if stringids:
            for id in ids:
                pf.write_string(utf8encode(id)[0])
        elif idslen:
            pf.write(compressed_ids)
        else:
            pf.write_array(ids)

        # Write the weights (omitted entirely when they are all 1.0)
        if weightslen == 1:
            pass
        # BUG FIX: was a separate "if", which wrote weight data even
        # when weightslen == 1 declared it omitted.
        elif compressed:
            pf.write(compressed_weights)
        else:
            pf.write_array(weights)

        # Write the values
        if posting_size != 0:
            values_string = ""

            # If the size of a posting value in this format is not fixed
            # (represented by a number less than zero), write an array of value
            # lengths
            if posting_size < 0:
                lengths = array("i",
                                (len(valuestring) for valuestring in values))
                values_string += lengths.tostring()

            values_string += "".join(values)

            if compressed:
                values_string = compress(values_string, compression)

            pf.write(values_string)

        # Seek back and write the pointer to the next block
        pf.flush()
        blockinfo.write_pointer(pf)

        self._reset_block()
        self.blockcount += 1
Example #17
0
def pickled_unicode(u):
    """Return *u* as a pickle protocol 2 BINUNICODE opcode string.

    The opcode is "X" followed by a 4-byte little-endian length and the
    UTF-8 encoding of the string.
    """
    utf8 = utf8encode(u)[0]
    # BUG FIX: the length field must count the UTF-8 *bytes*, not the
    # characters of u; the two differ for any non-ASCII text, producing
    # a corrupt pickle stream.
    return "X%s%s" % (pack_int_le(len(utf8)), utf8)
    def _write_block(self):
        """Flush the buffered block of postings to the post file.

        Writes a BlockInfo header (max ID and block statistics), then
        the IDs, weights, and values, compressing the sections when
        enabled, and finally seeks back to patch the next-block pointer.
        """
        posting_size = self.format.posting_size
        stringids = self.stringids
        pf = self.postfile
        ids = self.blockids
        values = self.blockvalues
        weights = self.blockweights
        postcount = len(ids)
        # Only compress when there are more than 4 postings in the block
        compressed = self.compressed and postcount > 4
        compression = self.compression

        # Compress the ID array; string IDs are always written
        # individually below, never compressed as a block
        if not stringids and compressed:
            compressed_ids = compress(ids.tostring(), compression)
            idslen = len(compressed_ids)
        else:
            idslen = 0

        if compressed:
            compressed_weights = compress(weights.tostring(), compression)
            weightslen = len(compressed_weights)
        else:
            weightslen = 0

        # TODO: THIS IS NOT CROSS-PLATFORM BECAUSE YOU ARE CONVERTING AN ARRAY
        # TO/FROM A STRING WITHOUT REGARD TO ENDIAN-NESS!!!

        # Write the blockinfo
        maxid = ids[-1]
        maxweight = max(weights)
        maxwol = 0.0
        minlength = 0
        if self.blocklengths:
            minlength = min(self.blocklengths)
            maxwol = max(w / l for w, l in zip(weights, self.blocklengths))

        blockinfo = BlockInfo(nextoffset=0, maxweight=maxweight, maxwol=maxwol,
                              minlength=minlength, postcount=postcount,
                              maxid=maxid, idslen=idslen, weightslen=weightslen)
        blockinfo.to_file(pf, stringids)

        # Write the IDs
        if stringids:
            for id in ids:
                pf.write_string(utf8encode(id)[0])
        elif idslen:
            pf.write(compressed_ids)
        else:
            pf.write_array(ids)

        # Write the weights
        if compressed:
            pf.write(compressed_weights)
        else:
            pf.write_array(weights)

        # Write the values (posting_size == 0 means no values stored)
        if posting_size != 0:
            values_string = ""

            # If the size of a posting value in this format is not fixed
            # (represented by a number less than zero), write an array of value
            # lengths
            if posting_size < 0:
                lengths = array("i", (len(valuestring) for valuestring in values))
                values_string += lengths.tostring()

            values_string += "".join(values)

            if compressed:
                values_string = compress(values_string, compression)

            pf.write(values_string)

        # Seek back and write the pointer to the next block
        pf.flush()
        blockinfo.write_pointer(pf)

        self.posttotal += postcount
        self._reset_block()
        self.blockcount += 1
Example #19
0
    def _write_block(self):
        """Flush the buffered block of postings to the post file.

        Writes a BlockInfo header followed by the IDs, weights, and
        values (each section optionally compressed), then seeks back to
        patch the pointer to the next block.
        """
        posting_size = self.format.posting_size
        stringids = self.stringids
        pf = self.postfile
        ids = self.blockids
        values = self.blockvalues
        weights = self.blockweights
        postcount = len(ids)
        # Only compress when there are more than 4 postings in the block
        compressed = self.compressed and postcount > 4
        compression = self.compression

        # Calculate block statistics
        maxid = ids[-1]
        maxweight = max(weights)
        maxwol = 0.0
        minlength = 0
        if self.blocklengths:
            minlength = min(self.blocklengths)
            maxwol = max(w / l for w, l in zip(weights, self.blocklengths))

        # Compress IDs if necessary (byteswap first so the stored bytes
        # do not depend on the host's endianness)
        if not stringids and compressed:
            if IS_LITTLE:
                ids.byteswap()
            compressed_ids = compress(ids.tostring(), compression)
            idslen = len(compressed_ids)
        else:
            idslen = 0

        # Compress weights if necessary.  weightslen == 1 is a marker
        # meaning "every weight is 1.0, nothing is stored on disk".
        if all(w == 1.0 for w in weights):
            weightslen = 1
        # BUG FIX: these were independent "if" statements, so the
        # all-ones marker above was immediately overwritten by the
        # compressed/uncompressed branches; they must be exclusive.
        elif compressed:
            if IS_LITTLE:
                weights.byteswap()
            compressed_weights = compress(weights.tostring(), compression)
            weightslen = len(compressed_weights)
        else:
            weightslen = 0

        # Write the blockinfo
        blockinfo = BlockInfo(nextoffset=0, maxweight=maxweight, maxwol=maxwol,
                              minlength=minlength, postcount=postcount,
                              maxid=maxid, idslen=idslen, weightslen=weightslen)
        blockinfo.to_file(pf, stringids)

        # Write the IDs
        if stringids:
            for id in ids:
                pf.write_string(utf8encode(id)[0])
        elif idslen:
            pf.write(compressed_ids)
        else:
            pf.write_array(ids)

        # Write the weights (omitted entirely when they are all 1.0)
        if weightslen == 1:
            pass
        # BUG FIX: was a separate "if", which wrote weight data even
        # when weightslen == 1 declared it omitted.
        elif compressed:
            pf.write(compressed_weights)
        else:
            pf.write_array(weights)

        # Write the values
        if posting_size != 0:
            values_string = ""

            # If the size of a posting value in this format is not fixed
            # (represented by a number less than zero), write an array of value
            # lengths
            if posting_size < 0:
                lengths = array("i", (len(valuestring) for valuestring in values))
                values_string += lengths.tostring()

            values_string += "".join(values)

            if compressed:
                values_string = compress(values_string, compression)

            pf.write(values_string)

        # Seek back and write the pointer to the next block
        pf.flush()
        blockinfo.write_pointer(pf)

        self._reset_block()
        self.blockcount += 1
Example #20
0
def encode_termkey(term):
    """Pack a (fieldnum, text) term into its on-disk key form."""
    fnum, txt = term
    return packushort(fnum) + utf8encode(txt)[0]
Example #21
0
def pickled_unicode(u):
    """Return *u* as a pickle protocol 2 BINUNICODE opcode string.

    The opcode is "X" followed by a 4-byte little-endian length and the
    UTF-8 encoding of the string.
    """
    utf8 = utf8encode(u)[0]
    # BUG FIX: the length field must count the UTF-8 *bytes*, not the
    # characters of u; the two differ for any non-ASCII text, producing
    # a corrupt pickle stream.
    return "X%s%s" % (pack_int_le(len(utf8)), utf8)