def __init__(self, directory):
    self._directory = directory

    # open the key shelve
    # Format:
    #   ( matrix, docsets )
    #   matrix = TermSimilitudeMatrix
    #   docsets = FrozenStringList
    keyfilename = os.path.join(directory, "compindex.key.bz2")
    fh = CompressedFile(keyfilename, "rb")
    matrix, docsets = cPickle.load(fh)
    fh.close()

    matrix = TermSimilitudeMatrix.unpickle(matrix)
    docsets = FrozenStringList.unpickle(docsets)
    self.matrix, self.docsets = matrix, docsets

    # see how many id files we have
    filenames = []
    for fn in os.listdir(directory):
        if fn.startswith("compindex-") and fn.endswith(".ids.bz2"):
            filenames.append(fn)
    self.idfiles_count = len(filenames)
def _get_ids_shelve(self, cual):
    '''Return the ids index.'''
    fname = os.path.join(self._directory, "compindex-%02d.ids.bz2" % cual)
    fh = CompressedFile(fname, "rb")
    idx = cPickle.load(fh)
    fh.close()
    return idx
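# Hedged sketch (not part of the original module): create() below shards the
# documents with plain `docid % N`, so resolving a docid on the read side has
# to repeat that arithmetic before loading the matching compindex-XX.ids.bz2
# bucket. `get_doc` is an illustrative wrapper around the attributes shown
# above, assuming the bucket count found at read time equals the N used at
# build time; it is not a method of the real class.
def get_doc(index, docid):
    """Fetch the value stored for `docid` from the right ids bucket."""
    bucket = docid % index.idfiles_count    # same arithmetic used at build time
    ids = index._get_ids_shelve(bucket)     # loads compindex-%02d.ids.bz2
    return ids[docid]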
def decomp(fname):
    """Decompress a file and return a dict."""
    fh = CompressedFile(fname, "rb")
    # encoding needed for compatibility w/ py2 cPickle
    idx = pickle.load(fh, encoding="latin1")
    fh.close()
    return idx
def create(cls, directory, source):
    '''Creates the index in the directory.

    The "source" generates pairs (key, value) to store in the index. The key
    must be a string, the value can be any hashable Python object.

    It must return the quantity of pairs indexed.
    '''
    ids_shelf = {}
    key_shelf = {}
    ids_cnter = 0
    tmp_reverse_id = {}
    indexed_counter = 0

    # fill them
    for key, value in source:
        indexed_counter += 1

        # process key
        if not isinstance(key, basestring):
            raise TypeError("The key must be string or unicode")

        # docid -> info final
        if value in tmp_reverse_id:
            docid = tmp_reverse_id[value]
        else:
            docid = ids_cnter
            tmp_reverse_id[value] = docid
            ids_cnter += 1
        ids_shelf[docid] = value

        # keys -> docid
        key_shelf.setdefault(key, set()).add(docid)

    # save key
    keyfilename = os.path.join(directory, "easyindex.key.bz2")
    fh = CompressedFile(keyfilename, "wb")
    cPickle.dump(key_shelf, fh, 2)
    fh.close()

    # split ids_shelf in N dicts of ~5k entries each
    N = int(round(len(ids_shelf) / 5000.0))
    if not N:
        N = 1
    all_idshelves = [{} for i in range(N)]
    for k, v in ids_shelf.iteritems():
        cual = utiles.coherent_hash(k) % N
        all_idshelves[cual][k] = v

    # save each dict where it corresponds
    for cual, shelf in enumerate(all_idshelves):
        fname = "easyindex-%03d.ids.bz2" % cual
        idsfilename = os.path.join(directory, fname)
        fh = CompressedFile(idsfilename, "wb")
        cPickle.dump(shelf, fh, 2)
        fh.close()

    return indexed_counter
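# Hedged usage sketch: the module and class names (easy_index, Index) are
# assumptions for illustration only; they are not confirmed by this excerpt,
# and create() is assumed to be a classmethod given its `cls` first argument.
import tempfile

from easy_index import Index  # assumed import


def source():
    """Yield (key, value) pairs: keys are words, values any hashable payload."""
    yield u"hello", ("hello.html", 7)
    yield u"world", ("world.html", 3)
    yield u"hello", ("hola.html", 1)    # the same key may map to several documents

directory = tempfile.mkdtemp()
indexed = Index.create(directory, source())     # writes easyindex*.bz2 files
idx = Index(directory)                          # reopen the index read-only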
def __init__(self, directory):
    self._directory = directory

    # open the key shelve
    keyfilename = os.path.join(directory, "easyindex.key.bz2")
    fh = CompressedFile(keyfilename, "rb")
    self.key_shelf = cPickle.load(fh)
    fh.close()

    # see how many id files we have
    filenames = []
    for fn in os.listdir(directory):
        if fn.startswith("easyindex-") and fn.endswith(".ids.bz2"):
            filenames.append(fn)
    self.idfiles_count = len(filenames)
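# Hedged sketch (illustration only): create() spreads the docids over the
# easyindex-NNN.ids.bz2 files with utiles.coherent_hash(docid) % N, so a reader
# must repeat exactly that arithmetic to locate a docid's bucket. The hash is
# passed in as a parameter here because utiles.coherent_hash itself is not part
# of this excerpt; bucket_filename() is not a function of the real module.
def bucket_filename(docid, idfiles_count, coherent_hash):
    """Return the ids bucket file that create() stored this docid in."""
    bucket = coherent_hash(docid) % idfiles_count
    return "easyindex-%03d.ids.bz2" % bucket

# e.g. with 4 buckets and a toy identity hash (the real one is utiles.coherent_hash):
print bucket_filename(17, 4, lambda n: n)   # -> easyindex-001.ids.bz2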
def create(cls, directory, source):
    '''Creates the index in the directory.

    The "source" generates pairs (key, value) to store in the index. The key
    must be a string, the value can be any hashable Python object.

    It must return the quantity of pairs indexed.
    '''
    ids_shelf = {}
    key_shelf = {}
    ids_cnter = 0
    tmp_reverse_id = {}
    indexed_counter = 0

    # fill them
    for key, value in source:
        indexed_counter += 1

        # process key
        if not isinstance(key, basestring):
            raise TypeError("The key must be string or unicode")
        if '\n' in key:
            raise ValueError("Key cannot contain newlines")

        # docid -> info final
        if value in tmp_reverse_id:
            docid = tmp_reverse_id[value]
        else:
            docid = ids_cnter
            tmp_reverse_id[value] = docid
            ids_shelf[docid] = value
            ids_cnter += 1

        # keys -> docid
        if key in key_shelf:
            bucket = key_shelf[key]
        else:
            # Let's use array: it's more compact in memory, and given that it
            # should be easy for the caller to remove most repetitions, it
            # should add only very little overhead.
            #
            # NOTE: right now, at most one repetition per property is sent
            # by cdpindex.py
            bucket = key_shelf[key] = array.array('l')
        bucket.append(docid)

    # prepare for serialization:
    # turn docsets into lists of delta-encoded integers (they're more compressible)
    print " Delta-encoding index buckets...",
    sys.stdout.flush()

    bucket_bytes = 0
    bucket_entries = 0
    bucket_maxentries = 0
    for key, docset in key_shelf.iteritems():
        key_shelf[key] = delta_encode(docset)
        bucket_entries += len(docset)
        bucket_bytes += len(key_shelf[key])
        bucket_maxentries = max(bucket_maxentries, len(docset))
        assert delta_decode(key_shelf[key]) == set(docset), \
            ("Delta-encoding error", docset)
    print "done"

    # print statistics
    print " Index contains:"
    print "  ", len(key_shelf), "terms"
    print "  ", bucket_entries, "entries"
    print "  ", len(ids_shelf), "documents"
    print
    print "  ", len(key_shelf) // max(1, len(ids_shelf)), "terms on avg per document"
    print
    print " Bucket bytes", bucket_bytes
    print " Bucket entries", bucket_entries
    print " Bucket maximum size", bucket_maxentries
    print " Avg bytes per entry", (float(bucket_bytes) / max(1, bucket_entries))

    # save key
    # Format:
    #   ( matrix, docsets )
    # Putting all keys together makes them more compressible.
    # Sorting them (skeys) further helps.
    # Joining them in a single string avoids pickling overhead
    # (50% average with so many small strings).
    # And keeping them joined in memory (FrozenStringList) helps
    # avoid referencing overhead.
    sitems = sorted([(k.encode("utf8"), v) for k, v in key_shelf.iteritems()])
    assert all('\n' not in k for k, v in sitems), \
        "Terms cannot contain newlines"

    # free the big dict... eats up a lot
    del key_shelf

    print " Computing similitude matrix...",
    sys.stdout.flush()

    def progress_cb(p):
        print >> sys.stderr, "\r Computing similitude matrix... %d%%\t" % int(p),
        sys.stderr.flush()

    matrix = TermSimilitudeMatrix(map(operator.itemgetter(0), sitems),
                                  progress_callback=progress_cb)
    docsets = FrozenStringList(map(operator.itemgetter(1), sitems))
    del sitems
    print "done"

    print " Saving:"

    keyfilename = os.path.join(directory, "compindex.key.bz2")
    fh = CompressedFile(keyfilename, "wb")
    cPickle.dump((matrix.pickle(), docsets.pickle()), fh, 2)
    print " Uncompressed keystore bytes", fh.tell()
    fh.close()

    fh = open(keyfilename, "rb")
    fh.seek(0, 2)
    print " Final keystore bytes", fh.tell()
    print
    fh.close()

    # split ids_shelf in N dicts of about 16M of pickled data each,
    # this helps get better compression ratios
    NB = sum(len(cPickle.dumps(item, 2)) for item in ids_shelf.iteritems())
    print " Total docstore bytes", NB

    N = int((NB + DOCSTORE_BUCKET_SIZE / 2) // DOCSTORE_BUCKET_SIZE)
    if not N:
        N = 1
    print " Docstore buckets", N, "(", NB // N, " bytes per bucket)"
    all_idshelves = [{} for i in xrange(N)]
    for k, v in ids_shelf.iteritems():
        cual = k % N
        all_idshelves[cual][k] = v

    # save each dict where it corresponds
    docucomp = 0
    doccomp = 0
    for cual, shelf in enumerate(all_idshelves):
        fname = "compindex-%02d.ids.bz2" % cual
        idsfilename = os.path.join(directory, fname)
        fh = CompressedFile(idsfilename, "wb")
        cPickle.dump(shelf, fh, 2)
        docucomp += fh.tell()
        fh.close()

        fh = open(idsfilename, "rb")
        fh.seek(0, 2)
        doccomp += fh.tell()
        fh.close()

    print " Docstore uncompressed bytes", docucomp
    print " Docstore compressed bytes", doccomp
    print

    return indexed_counter
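# Hedged sketch: delta_encode()/delta_decode() are used above but not defined
# in this excerpt. This is only an illustration of the idea they implement,
# assuming the encoded form is a byte string of 7-bit varints over the sorted
# docid deltas (small deltas -> short strings -> better bz2 compression).
def delta_encode(docset):
    """Encode docids as sorted deltas, each delta as a 7-bit varint."""
    out = []
    prev = 0
    for docid in sorted(docset):
        delta = docid - prev
        prev = docid
        while True:
            byte = delta & 0x7F
            delta >>= 7
            if delta:
                out.append(chr(byte | 0x80))    # high bit: more bytes follow
            else:
                out.append(chr(byte))
                break
    return "".join(out)


def delta_decode(encoded):
    """Decode the varint deltas back into the set of docids."""
    docids = set()
    current = prev = shift = 0
    for char in encoded:
        byte = ord(char)
        current |= (byte & 0x7F) << shift
        if byte & 0x80:
            shift += 7
        else:
            prev += current
            docids.add(prev)
            current = shift = 0
    return docids

assert delta_decode(delta_encode([3, 7, 7, 42])) == set([3, 7, 42])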