def to_file(self, file, stringids=False):
    """Write this block header to ``file``.

    Records the header's own position so the next-block pointer can be
    back-patched later (it lives 4 bytes into the header).

    :param file: the structured file to write to.
    :param stringids: if True, the maximum ID is written as a UTF-8
        string instead of an unsigned int.
    """
    self._blockstart = file.tell()
    self._pointer_pos = self._blockstart + 4
    header = self._struct.pack(1,     # flags
                               0, 0,  # unused B, H
                               self.nextoffset,
                               self.idslen,
                               self.weightslen,
                               self.postcount,
                               self.maxweight,
                               self.maxwol,
                               0,
                               length_to_byte(self.minlength))
    file.write(header)
    # The maximum ID goes after the fixed-size header because it can be
    # a string (in the case of a vector) rather than a number.
    if stringids:
        file.write_string(utf8encode(self.maxid)[0])
    else:
        file.write_uint(self.maxid)
def _write_block(self):
    """Flush the buffered postings out to the posting file as one block."""
    postfile = self.postfile
    block_ids = self.blockids
    block_values = self.blockvalues
    use_stringids = self.stringids
    value_size = self.format.posting_size
    count = len(block_ids)

    # The block's last ID (presumably the highest -- TODO confirm IDs are
    # appended in sorted order) is written first, outside the block body.
    if use_stringids:
        postfile.write_string(utf8encode(block_ids[-1])[0])
    else:
        postfile.write_uint(block_ids[-1])

    # Reserve space for the pointer to the next block, remembering where
    # it lives so we can back-patch it below.
    pointer_pos = postfile.tell()
    postfile.write_uint(0)

    # Number of postings in this block
    postfile.write_byte(count)

    # The IDs themselves
    if use_stringids:
        for block_id in block_ids:
            postfile.write_string(utf8encode(block_id)[0])
    else:
        postfile.write_array(block_ids)

    # Variable-size values (negative posting_size) are preceded by an
    # array of their individual lengths.
    if value_size < 0:
        postfile.write_array(array("I", (len(v) for v in block_values)))
    if value_size != 0:
        postfile.write("".join(block_values))

    # Back-patch the reserved slot with the offset of the next block,
    # then return to the end of the file.
    postfile.flush()
    next_block = postfile.tell()
    postfile.seek(pointer_pos)
    postfile.write_uint(next_block)
    postfile.seek(next_block)

    self.posttotal += count
    self._reset_block()
    self.blockcount += 1
def encode_posting(fieldNum, text, doc, freq, datastring):
    """Encodes a posting as a string, for sorting."""
    # Field number, then the UTF-8 term text terminated by a NUL byte,
    # then the packed doc/frequency pair and the posting payload.
    return (pack_ushort(fieldNum)
            + utf8encode(text)[0]
            + chr(0)
            + pack2ints(doc, freq)
            + datastring)
def encode_posting(fieldNum, text, doc, freq, datastring):
    """Encodes a posting as a string, for sorting."""
    parts = [pack_ushort(fieldNum),   # field number
             utf8encode(text)[0],     # UTF-8 term text
             chr(0),                  # NUL terminator for the text
             pack2ints(doc, freq),    # document number and frequency
             datastring]              # posting payload
    return "".join(parts)
def keycoder(self, key):
    """Encode a (fieldname, text) term as a sortable byte key.

    Fields are numbered in order of first appearance; the mapping is
    remembered in ``self.fieldmap``.
    """
    fieldname, text = key
    fmap = self.fieldmap
    try:
        fieldnum = fmap[fieldname]
    except KeyError:
        # First time we see this field: assign it the next field number.
        fieldnum = self.fieldcounter
        fmap[fieldname] = fieldnum
        self.fieldcounter += 1
    return pack_ushort(fieldnum) + utf8encode(text)[0]
def _write_node(self, dbfile, node):
    """Recursively write ``node`` and its unwritten descendants to
    ``dbfile`` and return the file offset at which ``node`` was written.

    ``self.offsets`` caches offsets by ``id(node)`` so shared subtrees
    are written only once.
    """
    edges = node._edges
    keys = edges.keys()
    offsets = self.offsets

    # Children are written first (depth-first) so their offsets are
    # known when this node's pointer array is emitted.
    ptrs = array("I")
    for key in keys:
        child = edges[key]
        cache_key = id(child)
        if cache_key not in offsets:
            offsets[cache_key] = self._write_node(dbfile, child)
        ptrs.append(offsets[cache_key])

    start = dbfile.tell()

    # Bit 0: this node represents the end of a word
    # Bit 1: this node has children
    flags = int(node.final) | (bool(keys) << 1)
    # Bit 2: all edge keys are single characters
    singles = all(len(k) == 1 for k in keys)
    flags |= singles << 2
    if singles:
        # Bit 3: all single-character keys fit in one byte
        sbytes = all(ord(k) <= 255 for k in keys)
        flags |= sbytes << 3
    dbfile.write_byte(flags)

    if keys:
        dbfile.write_varint(len(keys))
        dbfile.write_array(ptrs)
        if singles:
            # One ordinal per key; byte-sized when they all fit
            write_ord = dbfile.write_byte if sbytes else dbfile.write_ushort
            for key in keys:
                write_ord(ord(key))
        else:
            for key in keys:
                dbfile.write_string(utf8encode(key)[0])
    return start
def to_file(self, file, stringids=False):
    """Serialize this block header to ``file``, remembering the header's
    start position and the position of the next-block pointer field
    (4 bytes into the header) for later back-patching.
    """
    start = file.tell()
    self._blockstart = start
    self._pointer_pos = start + 4
    fields = (1,     # flags
              0, 0,  # unused B, H
              self.nextoffset,
              self.idslen,
              self.weightslen,
              self.postcount,
              self.maxweight,
              self.maxwol,
              0,
              length_to_byte(self.minlength))
    file.write(self._struct.pack(*fields))
    # Write the maximum ID after the header. We have to do this
    # separately because it might be a string (in the case of a vector)
    if stringids:
        file.write_string(utf8encode(self.maxid)[0])
    else:
        file.write_uint(self.maxid)
def to_labels(key):
    """Takes a string and returns a tuple of bytestrings, suitable for use
    as a key or path in an FSA/FST graph.

    :param key: a tuple/list of bytestrings, a bytestring, or a unicode
        string.
    :raises TypeError: if the key (or an element of it) cannot be
        converted.
    """
    # Convert to tuples of bytestrings (must be tuples so they can be hashed)
    keytype = type(key)

    # I hate the Python 3 bytes object so friggin much
    if keytype is tuple or keytype is list:
        if not all(isinstance(e, bytes_type) for e in key):
            # Bug fix: the original message had a %r placeholder but never
            # supplied an argument, so the error text was left unfilled.
            raise TypeError("%r contains a non-bytestring" % (key,))
        if keytype is list:
            key = tuple(key)
    elif isinstance(key, bytes_type):
        # Split into 1-byte bytestrings (slicing avoids py3 bytes->int)
        key = tuple(key[i:i + 1] for i in xrange(len(key)))
    elif isinstance(key, text_type):
        # Each character is UTF-8 encoded individually
        key = tuple(utf8encode(key[i:i + 1])[0] for i in xrange(len(key)))
    else:
        raise TypeError("Don't know how to convert %r" % key)
    return key
def encode_termkey(term):
    """Encode a (fieldnum, text) pair as a byte key: the packed field
    number followed by the UTF-8 encoded term text.
    """
    fieldnum, text = term
    encoded_text = utf8encode(text)[0]
    return packushort(fieldnum) + encoded_text
def keycoder(self, key):
    """Encode a (fieldname, text) term as a byte key.

    Field names not present in ``self.fieldmap`` are encoded with the
    sentinel field number 65535.
    """
    fieldname, text = key
    fieldnum = self.fieldmap.get(fieldname, 65535)
    encoded = pack_ushort(fieldnum) + utf8encode(text)[0]
    return encoded
def _write_block(self):
    """Flush the buffered postings to the posting file as one on-disk block.

    Writes a BlockInfo header (with block statistics), then the IDs,
    weights and values -- each optionally zlib-compressed -- and finally
    back-patches the header's pointer to the next block.
    """
    posting_size = self.format.posting_size
    stringids = self.stringids
    pf = self.postfile
    ids = self.blockids
    values = self.blockvalues
    weights = self.blockweights
    postcount = len(ids)

    # Only compress when there are more than 4 postings in the block
    compressed = self.compressed and postcount > 4
    compression = self.compression

    # Calculate block statistics
    maxid = ids[-1]
    maxweight = max(weights)
    maxwol = 0.0
    minlength = 0
    if self.blocklengths:
        minlength = min(self.blocklengths)
        maxwol = max(w / l for w, l in zip(weights, self.blocklengths))

    # Compress IDs if necessary. Byte-swap to big-endian on little-endian
    # platforms first so the compressed bytes are platform-independent.
    if not stringids and compressed:
        if IS_LITTLE:
            ids.byteswap()
        compressed_ids = compress(ids.tostring(), compression)
        idslen = len(compressed_ids)
    else:
        idslen = 0

    # Compress weights if necessary. weightslen == 1 is a sentinel meaning
    # "every weight is 1.0", in which case no weights are stored at all.
    # Bug fix: these branches must be exclusive (elif); as separate `if`
    # statements the sentinel was always clobbered by the compressed/
    # uncompressed branch below it.
    if all(w == 1.0 for w in weights):
        weightslen = 1
    elif compressed:
        if IS_LITTLE:
            weights.byteswap()
        compressed_weights = compress(weights.tostring(), compression)
        weightslen = len(compressed_weights)
    else:
        weightslen = 0

    # Write the blockinfo
    blockinfo = BlockInfo(nextoffset=0, maxweight=maxweight, maxwol=maxwol,
                          minlength=minlength, postcount=postcount,
                          maxid=maxid, idslen=idslen, weightslen=weightslen)
    blockinfo.to_file(pf, stringids)

    # Write the IDs
    if stringids:
        for id in ids:
            pf.write_string(utf8encode(id)[0])
    elif idslen:
        pf.write(compressed_ids)
    else:
        pf.write_array(ids)

    # Write the weights.
    # Bug fix: `elif`, not a second `if` -- when all weights are 1.0
    # nothing is written, and in the compressed case the second `if`
    # would have referenced an unbound compressed_weights.
    if weightslen == 1:
        # All weights are 1.0: the sentinel in the header is enough
        pass
    elif compressed:
        pf.write(compressed_weights)
    else:
        pf.write_array(weights)

    # Write the values
    if posting_size != 0:
        values_string = ""
        # If the size of a posting value in this format is not fixed
        # (represented by a number less than zero), write an array of value
        # lengths
        if posting_size < 0:
            lengths = array("i", (len(valuestring) for valuestring in values))
            values_string += lengths.tostring()
        values_string += "".join(values)
        if compressed:
            values_string = compress(values_string, compression)
        pf.write(values_string)

    # Seek back and write the pointer to the next block
    pf.flush()
    blockinfo.write_pointer(pf)
    # NOTE(review): unlike the sibling implementations, this version does
    # not update self.posttotal -- confirm it is tracked elsewhere.
    self._reset_block()
    self.blockcount += 1
def pickled_unicode(u):
    """Return the unicode string *u* as a pickle protocol 2 BINUNICODE
    operator ('X' + 4-byte little-endian length + UTF-8 bytes).
    """
    utf8 = utf8encode(u)[0]
    # Bug fix: the BINUNICODE opcode's length field is the byte length of
    # the UTF-8 encoding, not the character count of the unicode string.
    # The original packed len(u), which is wrong for any non-ASCII text.
    return "X%s%s" % (pack_int_le(len(utf8)), utf8)
def _write_block(self):
    """Flush the buffered postings to the posting file as one on-disk block.

    Writes a BlockInfo header (with block statistics), then the IDs,
    weights and values -- each optionally zlib-compressed -- and finally
    back-patches the header's pointer to the next block.
    """
    posting_size = self.format.posting_size
    stringids = self.stringids
    pf = self.postfile
    ids = self.blockids
    values = self.blockvalues
    weights = self.blockweights
    postcount = len(ids)

    # Only compress when there are more than 4 postings in the block
    compressed = self.compressed and postcount > 4
    compression = self.compression

    # Compress the numeric IDs up front so the header can record the
    # compressed length (idslen == 0 means "stored uncompressed").
    if not stringids and compressed:
        compressed_ids = compress(ids.tostring(), compression)
        idslen = len(compressed_ids)
    else:
        idslen = 0

    # Same for the weights (weightslen == 0 means "stored uncompressed").
    if compressed:
        compressed_weights = compress(weights.tostring(), compression)
        weightslen = len(compressed_weights)
    else:
        weightslen = 0

    # TODO: THIS IS NOT CROSS-PLATFORM BECAUSE YOU ARE CONVERTING AN ARRAY
    # TO/FROM A STRING WITHOUT REGARD TO ENDIAN-NESS!!!
    # NOTE(review): fixing this requires a matching change in the reader
    # (and a byteswap on little-endian platforms before tostring());
    # do not change the write side alone.

    # Write the blockinfo
    # maxid assumes IDs are appended in sorted order -- TODO confirm
    maxid = ids[-1]
    maxweight = max(weights)
    maxwol = 0.0
    minlength = 0
    if self.blocklengths:
        minlength = min(self.blocklengths)
        # Maximum weight-over-length ratio across the block's postings
        maxwol = max(w / l for w, l in zip(weights, self.blocklengths))

    blockinfo = BlockInfo(nextoffset=0, maxweight=maxweight, maxwol=maxwol,
                          minlength=minlength, postcount=postcount,
                          maxid=maxid, idslen=idslen, weightslen=weightslen)
    blockinfo.to_file(pf, stringids)

    # Write the IDs
    if stringids:
        for id in ids:
            pf.write_string(utf8encode(id)[0])
    elif idslen:
        pf.write(compressed_ids)
    else:
        pf.write_array(ids)

    # Write the weights
    if compressed:
        pf.write(compressed_weights)
    else:
        pf.write_array(weights)

    # Write the values
    if posting_size != 0:
        values_string = ""
        # If the size of a posting value in this format is not fixed
        # (represented by a number less than zero), write an array of value
        # lengths
        if posting_size < 0:
            lengths = array("i", (len(valuestring) for valuestring in values))
            values_string += lengths.tostring()
        values_string += "".join(values)
        if compressed:
            values_string = compress(values_string, compression)
        pf.write(values_string)

    # Seek back and write the pointer to the next block
    pf.flush()
    blockinfo.write_pointer(pf)
    self.posttotal += postcount
    self._reset_block()
    self.blockcount += 1