def __init__(self, dbfile, offset, expand=True):
    """Read one node of the on-disk graph from ``dbfile``.

    :param dbfile: the structured file to read the node from.
    :param offset: the byte position of this node's record in the file;
        it is also used as the node's ID.
    :param expand: if True, an edge whose key is more than one character
        long is expanded into a chain by wrapping the remainder of the
        key in a ``PatNode``.
    """

    self.id = offset
    self.dbfile = dbfile

    dbfile.seek(offset)
    flags = dbfile.read_byte()
    # Bit 0 of the flags marks this node as a final (accepting) state
    self.final = bool(flags & 1)

    # Maps edge key (unicode char or string) -> child pointer or PatNode
    self._edges = {}
    if flags & 2:  # bit 1: the node has outgoing edges
        singles = flags & 4  # bit 2: every edge key is a single character
        # Renamed from "bytes" to avoid shadowing the builtin
        use_bytes = flags & 8  # bit 3: single-char keys fit in one byte

        nkeys = dbfile.read_varint()
        # Child pointers are stored as one array, followed by the keys
        ptrs = dbfile.read_array("I", nkeys)
        for i in xrange(nkeys):
            ptr = ptrs[i]
            if singles:
                if use_bytes:
                    charnum = dbfile.read_byte()
                else:
                    charnum = dbfile.read_ushort()
                self._edges[unichr(charnum)] = ptr
            else:
                key = utf8decode(dbfile.read_string())[0]
                if len(key) > 1 and expand:
                    # Expand a multi-char key into a patricia-style chain
                    self._edges[key[0]] = PatNode(dbfile, key[1:], ptr)
                else:
                    self._edges[key] = ptr
def from_file(file, stringids=False):
    # Read a BlockInfo header from ``file`` at its current position and
    # return a populated BlockInfo object.
    # NOTE(review): the "file" parameter shadows the Py2 builtin; kept
    # for interface compatibility.
    here = file.tell()
    encoded_header = file.read(BlockInfo._struct.size)
    header = BlockInfo._struct.unpack(encoded_header)
    (flags, _, _, nextoffset, idslen, weightslen, postcount,
     maxweight, maxwol, _, minlength) = header

    if not flags:
        # presumably an older on-disk format: the first 8 bytes hold an
        # absolute next-block offset -- TODO confirm against the writer
        nextoffset = unpack_long(encoded_header[:8])
    else:
        # The stored offset is relative to the start of this header
        nextoffset = here + nextoffset

    assert postcount > 0
    # Minimum length is stored as a single encoded byte
    minlength = byte_to_length(minlength)

    # The maximum ID follows the fixed-size header: either a
    # length-prefixed string or a fixed-size unsigned int
    if stringids:
        maxid = utf8decode(file.read_string())[0]
    else:
        maxid = file.read_uint()

    # Posting data begins immediately after the max ID
    dataoffset = file.tell()
    return BlockInfo(flags=flags, nextoffset=nextoffset,
                     postcount=postcount, maxweight=maxweight,
                     maxwol=maxwol, maxid=maxid, minlength=minlength,
                     dataoffset=dataoffset, idslen=idslen,
                     weightslen=weightslen)
def _read_ids(self, offset, postcount):
    """Read ``postcount`` posting IDs starting at ``offset``.

    Returns a ``(ids, end_offset)`` tuple, where ``end_offset`` is the
    file position immediately after the ID data.
    """

    postfile = self.postfile

    if not self.stringids:
        # Fixed-size unsigned ints: read in one shot and compute the end
        # position arithmetically.
        values = postfile.get_array(offset, "I", postcount)
        return (values, offset + _INT_SIZE * postcount)

    # Variable-length UTF-8 strings must be read sequentially.
    postfile.seek(offset)
    read_string = postfile.read_string
    values = []
    for _ in xrange(postcount):
        values.append(utf8decode(read_string())[0])
    return (values, postfile.tell())
def _read_ids(self, offset, postcount, idslen):
    # Read the document IDs of a posting block.
    # Returns (ids, newoffset) where newoffset is the file position just
    # past the ID data.
    pf = self.postfile
    pf.seek(offset)
    if self.stringids:
        # IDs stored as length-prefixed UTF-8 strings; must read
        # sequentially, so the end offset comes from tell()
        rs = pf.read_string
        ids = [utf8decode(rs())[0] for _ in xrange(postcount)]
        newoffset = pf.tell()
    elif idslen:
        # IDs stored zlib-compressed; idslen is the compressed length.
        # NOTE(review): array.fromstring was removed in Python 3.9
        # (frombytes is the replacement) -- fine on the Py2 line this
        # code targets.
        ids = array("I")
        ids.fromstring(decompress(pf.read(idslen)))
        newoffset = offset + idslen
    else:
        # Uncompressed array of unsigned ints
        ids = pf.read_array("I", postcount)
        newoffset = offset + _INT_SIZE * postcount
    return (ids, newoffset)
def decode_posting(posting):
    """Decodes an encoded posting string into a
    (field_number, text, document_number, frequency, datastring) tuple.

    NOTE(review): the previous docstring omitted the frequency element;
    the function actually returns five values.
    """
    fieldnum = unpack_ushort(posting[:_USHORT_SIZE])[0]
    # The UTF-8 text after the field number is terminated by a NUL byte
    zero = posting.find(chr(0), _USHORT_SIZE)
    text = utf8decode(posting[_USHORT_SIZE:zero])[0]
    # Two packed ints (doc number and frequency) follow the NUL
    metastart = zero + 1
    metaend = metastart + _INT_SIZE * 2
    doc, freq = unpack2ints(posting[metastart:metaend])
    # Whatever remains is the value string for the posting format
    datastring = posting[metaend:]
    return fieldnum, text, doc, freq, datastring
def _read_block_header(self, offset):
    """Parse a posting block header at ``offset``.

    Returns a ``(maxid, nextoffset, postcount, dataoffset)`` tuple,
    where ``dataoffset`` is the position just past the header.
    """

    postfile = self.postfile

    # Highest ID in the block: a length-prefixed UTF-8 string or a
    # fixed-size unsigned int, depending on the field's ID type.
    if self.stringids:
        postfile.seek(offset)
        maxid = utf8decode(postfile.read_string())[0]
        pos = postfile.tell()
    else:
        maxid = postfile.get_uint(offset)
        pos = offset + _INT_SIZE

    # Offset of the next block
    nextoffset = postfile.get_uint(pos)
    pos += _INT_SIZE

    # Number of postings in this block: one byte, must be non-zero
    postcount = postfile.get_byte(pos)
    assert postcount > 0
    pos += 1

    return (maxid, nextoffset, postcount, pos)
def from_file(cls, postfile, stringids=False):
    # Construct a block object by reading its header from ``postfile``
    # at the file's current position.
    pos = postfile.tell()
    block = cls(postfile, stringids=stringids)
    block.postfile = postfile

    header = cls._struct.unpack(postfile.read(cls._struct.size))
    # Header fields used (by index): [3] relative next-block offset,
    # [4] ids length, [5] weights length, [6] posting count,
    # [7] max weight, [10] encoded minimum length
    block.nextoffset = pos + header[3]
    block.idslen = header[4]
    block.wtslen = header[5]
    block.count = header[6]
    block.maxweight = header[7]
    block.minlength = byte_to_length(header[10])

    # The maximum ID follows the fixed-size header: either a
    # length-prefixed string or a fixed-size unsigned int
    if stringids:
        block.maxid = utf8decode(postfile.read_string())[0]
    else:
        block.maxid = postfile.read_uint()

    # Posting data starts immediately after the max ID
    block.dataoffset = postfile.tell()
    return block
def from_file(cls, postfile, stringids=False):
    # Read a block header from ``postfile`` at the current position and
    # return a populated block object.
    pos = postfile.tell()
    block = cls(postfile, stringids=stringids)

    encoded_header = postfile.read(cls._struct.size)
    header = cls._struct.unpack(encoded_header)
    (flags, _, _, nextoffset, block.idslen, block.weightslen,
     block.postcount, block.maxweight, block.maxwol, _,
     minlength) = header

    # The stored next-block offset is relative to the header start
    block.nextoffset = pos + nextoffset
    # Minimum length is stored as a single encoded byte
    block.minlength = byte_to_length(minlength)
    assert block.postcount > 0, "postcount=%r" % block.postcount

    # The maximum ID follows the fixed-size header: either a
    # length-prefixed string or a fixed-size unsigned int
    if stringids:
        block.maxid = utf8decode(postfile.read_string())[0]
    else:
        block.maxid = postfile.read_uint()

    # Posting data starts immediately after the max ID
    block.dataoffset = postfile.tell()
    return block
def read_ids(self):
    # Read, cache, and return the document IDs for this block.
    # Also records where the weights start (immediately after the IDs).
    postfile = self.postfile
    offset = self.dataoffset
    postcount = self.count

    postfile.seek(offset)
    if self.stringids:
        # IDs stored as length-prefixed UTF-8 strings; read sequentially
        rs = postfile.read_string
        ids = [utf8decode(rs())[0] for _ in xrange(postcount)]
        newoffset = postfile.tell()
    elif self.idslen:
        # IDs stored as a zlib-compressed array of unsigned ints.
        # NOTE(review): the byteswap suggests the stored order is
        # big-endian -- confirm against the block writer.
        ids = array("I")
        array_frombytes(ids, decompress(postfile.read(self.idslen)))
        if IS_LITTLE:
            ids.byteswap()
        newoffset = offset + self.idslen
    else:
        # Uncompressed array of unsigned ints
        ids = postfile.read_array("I", postcount)
        newoffset = offset + _INT_SIZE * postcount

    self.ids = ids
    self.weights_offset = newoffset
    return ids
def read_ids(self):
    # Read, cache, and return the document IDs for this block.
    # Also records where the weights start (immediately after the IDs).
    postfile = self.postfile
    offset = self.dataoffset
    postcount = self.postcount

    postfile.seek(offset)
    if self.stringids:
        # IDs stored as length-prefixed UTF-8 strings; read sequentially
        rs = postfile.read_string
        ids = [utf8decode(rs())[0] for _ in xrange(postcount)]
        newoffset = postfile.tell()
    elif self.idslen:
        # IDs stored as a zlib-compressed array of unsigned ints.
        # NOTE(review): array.fromstring was removed in Python 3.9
        # (frombytes is the replacement) -- fine on the Py2 line this
        # code targets. The byteswap suggests a big-endian stored order
        # -- confirm against the block writer.
        ids = array("I")
        ids.fromstring(decompress(postfile.read(self.idslen)))
        if IS_LITTLE:
            ids.byteswap()
        newoffset = offset + self.idslen
    else:
        # Uncompressed array of unsigned ints
        ids = postfile.read_array("I", postcount)
        newoffset = offset + _INT_SIZE * postcount

    self.ids = ids
    self.weights_offset = newoffset
    return ids
def keydecoder(self, v):
    """Decode an encoded term key ``v`` into a ``(fieldname, text)``
    tuple.
    """

    assert isinstance(v, bytes_type)
    # First two bytes: field number; remainder: UTF-8 term text
    fieldnum = unpack_ushort(v[:2])[0]
    fieldname = self.names[fieldnum]
    text = utf8decode(v[2:])[0]
    return (fieldname, text)
def within(graph, text, k=1, prefix=0, address=None):
    """Yields a series of keys in the given graph within ``k`` edit distance
    of ``text``. If ``prefix`` is greater than 0, all keys must match the
    first ``prefix`` characters of ``text``.
    """

    text = to_labels(text)
    if address is None:
        address = graph._root

    sofar = emptybytes
    accept = False
    # Consume the required prefix exactly before fuzzy matching starts
    if prefix:
        prefixchars = text[:prefix]
        arc = graph.find_path(prefixchars, address=address)
        if arc is None:
            return
        sofar = emptybytes.join(prefixchars)
        address = arc.target
        accept = arc.accept

    # Depth-first search over (node address, edits left, position in
    # text, key bytes so far, accept flag) states
    stack = [(address, k, prefix, sofar, accept)]
    seen = set()
    while stack:
        state = stack.pop()
        # Have we already tried this state?
        if state in seen:
            continue
        seen.add(state)

        address, k, i, sofar, accept = state
        # If we're at the end of the text (or deleting enough chars would get
        # us to the end and still within K), and we're in the accept state,
        # yield the current result
        if (len(text) - i <= k) and accept:
            yield utf8decode(sofar)[0]

        # If we're in the stop state, give up
        if address is None:
            continue

        # Exact match
        if i < len(text):
            arc = graph.find_arc(address, text[i])
            if arc:
                stack.append((arc.target, k, i + 1, sofar + text[i],
                              arc.accept))
        # If K is already 0, can't do any more edits
        if k < 1:
            continue
        k -= 1

        arcs = graph.arc_dict(address)
        # Insertions
        stack.extend((arc.target, k, i, sofar + char, arc.accept)
                     for char, arc in iteritems(arcs))
        # Deletion, replacement, and transpo only work before the end
        if i >= len(text):
            continue
        char = text[i]

        # Deletion
        stack.append((address, k, i + 1, sofar, False))
        # Replacement
        for char2, arc in iteritems(arcs):
            if char2 != char:
                stack.append((arc.target, k, i + 1, sofar + char2,
                              arc.accept))
        # Transposition
        if i < len(text) - 1:
            char2 = text[i + 1]
            if char != char2 and char2 in arcs:
                # Find arc from next char to this char
                target = arcs[char2].target
                if target:
                    arc = graph.find_arc(target, char)
                    if arc:
                        stack.append((arc.target, k, i + 2,
                                      sofar + char2 + char, arc.accept))
def prefix_string(self):
    """Return the labels of the path from the root to the current arc,
    decoded from UTF-8 to a unicode string.
    """

    raw = self.prefix_bytes()
    decoded, _ = utf8decode(raw)
    return decoded
def peek_key_string(self):
    """Return the next closest key in the graph, decoded from UTF-8 to a
    unicode string.
    """

    key, _ = utf8decode(self.peek_key_bytes())
    return key
def keydecoder(self, v):
    """Decode encoded term key bytes ``v`` into a ``(fieldname, text)``
    tuple.
    """

    # Field number lives in the first two bytes; the rest is UTF-8 text
    fieldnum = unpack_ushort(v[:2])[0]
    fieldname = self.names[fieldnum]
    text = utf8decode(v[2:])[0]
    return (fieldname, text)
def flatten_strings(self):
    """Like ``flatten()``, but yields each key decoded from UTF-8 to a
    unicode string.
    """

    # Call flatten() eagerly, then decode each key lazily
    keys = self.flatten()
    return (utf8decode(key)[0] for key in keys)
def keydecoder(self, v):
    """Decode an encoded term key into a ``(fieldname, text)`` tuple.

    Accepts either raw bytes or a text string; text is first encoded as
    Latin-1 to recover the original byte values.
    """

    if isinstance(v, text_type):
        v = v.encode('latin-1')
    # Field number lives in the first two bytes; the rest is UTF-8 text
    fieldnum = unpack_ushort(v[:2])[0]
    text = utf8decode(v[2:])[0]
    return (self.names[fieldnum], text)
def decode_termkey(key):
    """Decode an encoded term key into a ``(fieldnum, text)`` tuple."""

    # Leading ushort is the field number; the remainder is UTF-8 text
    fieldnum = unpackushort(key[:_USHORT_SIZE])
    text = utf8decode(key[_USHORT_SIZE:])[0]
    return fieldnum, text
def within(graph, text, k=1, prefix=0, address=None):
    """Yields a series of keys in the given graph within ``k`` edit distance
    of ``text``. If ``prefix`` is greater than 0, all keys must match the
    first ``prefix`` characters of ``text``.
    """

    text = to_labels(text)
    if address is None:
        address = graph._root

    sofar = emptybytes
    accept = False
    # Consume the required prefix exactly before fuzzy matching starts
    if prefix:
        prefixchars = text[:prefix]
        arc = graph.find_path(prefixchars, address=address)
        if arc is None:
            return
        sofar = emptybytes.join(prefixchars)
        address = arc.target
        accept = arc.accept

    # Depth-first search over (node address, edits left, position in
    # text, key bytes so far, accept flag) states
    stack = [(address, k, prefix, sofar, accept)]
    seen = set()
    while stack:
        state = stack.pop()
        # Have we already tried this state?
        if state in seen:
            continue
        seen.add(state)

        address, k, i, sofar, accept = state
        # If we're at the end of the text (or deleting enough chars would get
        # us to the end and still within K), and we're in the accept state,
        # yield the current result
        if (len(text) - i <= k) and accept:
            yield utf8decode(sofar)[0]

        # If we're in the stop state, give up
        if address is None:
            continue

        # Exact match
        if i < len(text):
            arc = graph.find_arc(address, text[i])
            if arc:
                stack.append((arc.target, k, i + 1, sofar + text[i],
                              arc.accept))
        # If K is already 0, can't do any more edits
        if k < 1:
            continue
        k -= 1

        arcs = graph.arc_dict(address)
        # Insertions
        stack.extend((arc.target, k, i, sofar + char, arc.accept)
                     for char, arc in iteritems(arcs))
        # Deletion, replacement, and transpo only work before the end
        if i >= len(text):
            continue
        char = text[i]

        # Deletion
        stack.append((address, k, i + 1, sofar, False))
        # Replacement
        for char2, arc in iteritems(arcs):
            if char2 != char:
                stack.append((arc.target, k, i + 1, sofar + char2,
                              arc.accept))
        # Transposition
        if i < len(text) - 1:
            char2 = text[i + 1]
            if char != char2 and char2 in arcs:
                # Find arc from next char to this char
                target = arcs[char2].target
                if target:
                    arc = graph.find_arc(target, char)
                    if arc:
                        stack.append((arc.target, k, i + 2,
                                      sofar + char2 + char, arc.accept))