def getIdxPos(token):
    """Find the value stored for ``token`` in ``dict.data``.

    The token is decoded from UTF-8, lower-cased and hashed with
    ``func.myHash``.  ``lterms`` (a sorted list of hashes) and ``fterms``
    (hash -> record number) pick the range of records to inspect, and that
    range is then bisected by reading records with ``unpackTerm``.

    Returns the value unpacked next to the matching hash.  Returns ``None``
    when the hash sorts before every entry of ``lterms`` or when the record
    the search settles on carries a different hash.
    """
    wanted = func.myHash(unicode(token, 'utf-8').lower())
    decoder = func.Encoder(encMode)

    with open(func.PATH + "dict.data", 'rb') as dct:
        bucket = bisect_right(lterms, wanted)
        if bucket == 0:
            return None

        # Record numbers bounding the bucket that may hold the hash.
        lo = fterms[lterms[bucket - 1]]
        hi = fterms[lterms[bucket]]

        # Halve [lo, hi) until a single candidate record is left at ``lo``.
        while hi - lo > 1:
            mid = (lo + hi) // 2
            if decoder.unpackTerm(dct, mid)[0] > wanted:
                hi = mid
            else:
                lo = mid

        found, val = decoder.unpackTerm(dct, lo)
        return val if found == wanted else None
def main():
    """Load the lookup tables, then feed stdin lines to ``compute``.

    Steps, in order:

    1. Every line of ``urls.list`` is appended to ``urls`` (line endings
       are kept as read).
    2. The last byte of ``fastDict.data`` selects the global ``encMode``:
       1 means ``'simple9'``, anything else ``'varbyte'``.
    3. Each record of ``fastDict.data`` is unpacked into a (hash, position)
       pair; the hash is appended to ``lterms`` and mapped to its position
       in ``fterms``.
    4. A final entry ``1 << 63`` is appended, mapped to the number of
       ``func.TERM_SIZE``-sized records in ``dict.data``.
    5. Lines are read from stdin and handed to ``compute`` until an empty
       line or end of input.
    """
    global encMode

    with open(func.PATH + "urls.list", 'r') as url_file:
        urls.extend(url_file)

    fast_path = func.PATH + "fastDict.data"
    with open(fast_path, 'rb') as fdct:
        # The trailing byte is the encoding flag.
        fdct.seek(-1, 2)
        flag = struct.unpack('B', fdct.read(1))[0]
        if flag == 1:
            encMode = 'simple9'
        else:
            encMode = 'varbyte'

        record_count = os.path.getsize(fast_path) // func.TERM_SIZE
        decoder = func.Encoder(encMode)
        for record_no in range(record_count):
            h, pos = decoder.unpackTerm(fdct, record_no)
            lterms.append(h)
            fterms[h] = pos

    # Closing entry so that lterms[num] exists for the last real hash.
    sentinel = 1 << 63
    lterms.append(sentinel)
    fterms[sentinel] = os.path.getsize(func.PATH + "dict.data") // func.TERM_SIZE

    # An empty line stops the loop; EOF (from reading or from compute) too.
    try:
        for req in iter(raw_input, ""):
            compute(req)
    except EOFError:
        pass
def getDocs(pos):
    """Return the ``Set`` of ids decoded from ``idx.data`` at ``pos``.

    The numbers produced by ``unpackIdx`` are accumulated: each one is added
    to the previous result, and every running total goes into the set.

    ``pos`` may be ``None``, in which case an empty ``Set`` is returned and
    no file is opened.
    """
    if pos is None:
        return Set()

    docs = Set()
    running = 0
    decoder = func.Encoder(encMode)
    with open(func.PATH + "idx.data", 'rb') as idx:
        for gap in decoder.unpackIdx(idx, pos):
            running += gap
            docs.add(running)
    return docs
def main():
    """Group the records of ``preDict.data`` by hash and optimise each group.

    The last byte of ``preDict.data`` selects the encoding (1 means
    ``'simple9'``, anything else ``'varbyte'``).  Every record is unpacked
    into a (hash, position) pair and the position is appended to
    ``terms[hash]``.  The shared encoders ``enc``, ``fenc`` and ``denc`` are
    switched to that encoding, ``optimize`` is called for each hash in
    sorted order, and a flag byte (1 for simple9, 0 otherwise) is appended
    to ``fastDict.data``.
    """
    pre_path = func.PATH + "preDict.data"
    with open(pre_path, 'rb') as f:
        f.seek(-1, 2)
        flag = struct.unpack('B', f.read(1))[0]
        # NOTE(review): the mode is a local here; the module-level
        # ``encMode`` is not rebound by this function.
        if flag == 1:
            mode = 'simple9'
        else:
            mode = 'varbyte'

        reader = func.Encoder(mode)
        record_count = os.path.getsize(pre_path) // func.TERM_SIZE
        for record_no in range(record_count):
            h, pos = reader.unpackTerm(f, record_no)
            terms[h].append(pos)

    # Same order as before: enc, then fenc, then denc.
    for shared in (enc, fenc, denc):
        shared.changeMode(mode)

    for key, value in sorted(terms.items()):
        optimize(key, value)

    if mode == 'simple9':
        mode_byte = 1
    else:
        mode_byte = 0
    with open(func.PATH + "fastDict.data", 'ab') as fdct:
        fdct.write(struct.pack('B', mode_byte))
#!/usr/bin/env python import sys import os import func import struct from collections import defaultdict from collections import OrderedDict PATH = "./files/" terms = defaultdict(list) enc = func.Encoder() denc = func.Encoder() fenc = func.Encoder() encMode = "" prev = 0 def optimize(h, arr): global enc global denc global fenc res = [] last = 0 with open(func.PATH + "preIdx.data") as f: for val in arr: tmp = enc.unpackIdx(f, val) tmp[0] -= last res.extend(tmp)