def __commit_terms(self, batch_size=64):
    """Write the terms of ``self.id_term_map`` to ``term.db`` in batches.

    Terms are read by consecutive integer id, starting at 0 and stopping at
    ``len(self.term_id_map)``. Each group of ``batch_size`` terms is pickled
    as one list, passed through ``self.compressHC`` and stored under a batch
    number that counts up from 0. A shorter trailing group is stored the
    same way.

    Args:
        batch_size: Number of terms stored per record (default 64).
    """
    store = leveldb.LevelDB("%s/term.db" % self.data_dir)

    def flush(key, terms):
        # One record per batch: encoded batch number -> compressed pickle.
        blob = self.compressHC(pickle.dumps(terms))
        store.Put(numencode.encode_uint(key), blob)

    pending = []
    next_key = 0
    # Ids are looked up one by one (not sliced) because the map is only
    # known to support integer indexing.
    for term_id in range(len(self.term_id_map)):
        pending.append(self.id_term_map[term_id])
        # The batch is full on the last id of each batch_size-wide window.
        if term_id % batch_size == batch_size - 1:
            flush(next_key, pending)
            pending = []
            next_key += 1
    if pending:
        flush(next_key, pending)
def write_terms(self, id_term_map, batch_size=64):
    """Write ``id_term_map`` to ``term.db`` in fixed-size batches.

    Args:
        id_term_map: Container indexed by the integer ids
            ``0 .. len(id_term_map) - 1``; each lookup yields one term.
        batch_size: Number of terms stored per record (default 64).

    Each batch is pickled as a list, passed through ``self.compressHC`` and
    stored under a batch number that counts up from 0. A shorter trailing
    batch is stored the same way. The total term count is logged at the end.
    """
    store = leveldb.LevelDB("%s/term.db" % self.data_dir)

    def flush(key, terms):
        # One record per batch: encoded batch number -> compressed pickle.
        blob = self.compressHC(pickle.dumps(terms))
        store.Put(numencode.encode_uint(key), blob)

    pending = []
    next_key = 0
    # Ids are looked up one by one (not sliced) because the map is only
    # known to support integer indexing.
    for term_id in range(len(id_term_map)):
        pending.append(id_term_map[term_id])
        # The batch is full on the last id of each batch_size-wide window.
        if term_id % batch_size == batch_size - 1:
            flush(next_key, pending)
            pending = []
            next_key += 1
    if pending:
        flush(next_key, pending)
    logging.info("wrote %d terms" % len(id_term_map))
def write_objects(self, id_object_map):
    """Store objects in ``object.db`` with one synchronous batch write.

    Args:
        id_object_map: Either a dict mapping object id to object, or a
            sized iterable of ``(obj_id, obj)`` pairs.

    Each object is converted with ``self.obj_to_str``, passed through
    ``self.compressHC`` and queued under ``numencode.encode_uint(obj_id)``.
    After the batch is written the object count is logged and
    ``self.update_objnum(self.objnum)`` is called.
    """
    # Iterating a dict directly yields only its keys, which cannot be
    # unpacked into (obj_id, obj); go through items() for mappings and keep
    # accepting ready-made pair iterables unchanged.
    if isinstance(id_object_map, dict):
        pairs = id_object_map.items()
    else:
        pairs = id_object_map

    object_store = leveldb.LevelDB("%s/object.db" % self.data_dir)
    w_batch = leveldb.WriteBatch()
    for obj_id, obj in pairs:
        obj_blob = self.compressHC(self.obj_to_str(obj))
        w_batch.Put(numencode.encode_uint(obj_id), obj_blob)
    object_store.Write(w_batch, sync=True)
    logging.info("wrote %d objects", len(id_object_map))
    self.update_objnum(self.objnum)
def __update_arg_index(self):
    """Merge ``self.arg_cache`` into ``arg.index`` with one batch write.

    For every ``(term_id, plist)`` in the cache, the list already stored
    under the encoded term id is fetched. A ``KeyError`` from ``Get`` is
    treated as "nothing stored yet". New terms are encoded with
    ``self.encode_posting_list``; existing ones are combined through
    ``self.update_posting_list``. All puts are queued and written
    synchronously at the end.
    """
    index_db = leveldb.LevelDB("%s/arg.index" % self.data_dir)
    pending = leveldb.WriteBatch()
    for term_id, postings in self.arg_cache.iteritems():
        key = numencode.encode_uint(term_id)
        try:
            stored = index_db.Get(key)
        except KeyError:
            stored = None
        blob = (
            self.encode_posting_list(postings)
            if stored is None
            else self.update_posting_list(stored, postings)
        )
        pending.Put(key, blob)
    index_db.Write(pending, sync=True)
def search(self, rel_type=None, arg_query=()):
    """Return the triples whose posting lists match every query argument.

    Args:
        rel_type: When not None, only triples whose first element equals
            this value are returned.
        arg_query: Iterable of query arguments. Each one is a
            ``(term, pos)`` list/tuple, a bare term string, or a bare
            integer term id. Bare arguments get position -1, which
            disables position filtering for that argument.

    Returns:
        ``()`` when no argument resolves to a term id present in
        ``self.id_term_map``; otherwise a list of entries taken from
        ``self.id_triple_map``.

    Arguments that do not resolve to a known term id are dropped from the
    query rather than making the result empty.
    """

    def lookup(term):
        # Unicode terms are encoded to UTF-8 before the id lookup.
        if isinstance(term, unicode):
            term = term.encode("utf-8")
        return self.term_id_map.get(term)

    # Step 1: normalise every argument to a (term_id, pos) pair.
    query = []
    for arg in arg_query:
        pos = -1
        if isinstance(arg, (list, tuple)):
            term, pos = arg
            # A non-string first element is taken to already be a term id.
            term_id = lookup(term) if isinstance(term, basestring) else term
        elif isinstance(arg, basestring):
            term_id = lookup(arg)
        elif isinstance(arg, int):
            term_id = arg
        else:
            term_id = None
        if term_id is None or term_id not in self.id_term_map:
            continue
        query.append((term_id, pos))

    # Step 2: intersect the triple ids found for each term.
    matched = None
    for term_id, pos in query:
        try:
            blob = self.arg_index.Get(numencode.encode_uint(term_id))
            entries = self.index.decode_posting_list(blob)
        except KeyError:
            entries = []
        if pos == -1:
            triple_ids = set(entry[0] for entry in entries)
        else:
            triple_ids = set(entry[0] for entry in entries if entry[1] == pos)
        if matched is None:
            matched = triple_ids
        else:
            matched &= triple_ids

    if matched is None:
        return ()

    # Step 3: resolve ids to triples and apply the optional type filter.
    triples = [self.id_triple_map[triple_id] for triple_id in matched]
    if rel_type is None:
        return triples
    return [triple for triple in triples if triple[0] == rel_type]
def update_posting_lists(self, post_lists):
    """Merge ``post_lists`` into ``plist.index`` with one batch write.

    Args:
        post_lists: Mapping of term id to posting list (iterated with
            ``iteritems``).

    For each term, the list already stored under the encoded term id is
    fetched. A ``KeyError`` from ``Get`` counts the term as new; any other
    outcome counts it as an update. New lists are encoded with
    ``self.encode_posting_list``; existing ones are combined through
    ``self.update_posting_list``. Both counters are logged after the
    synchronous write.
    """
    store = leveldb.LevelDB("%s/plist.index" % self.data_dir)
    pending = leveldb.WriteBatch()
    updated = 0
    created = 0
    for term_id, postings in post_lists.iteritems():
        key = numencode.encode_uint(term_id)
        try:
            stored = store.Get(key)
        except KeyError:
            created += 1
            stored = None
        else:
            updated += 1
        blob = (
            self.encode_posting_list(postings)
            if stored is None
            else self.update_posting_list(stored, postings)
        )
        pending.Put(key, blob)
    store.Write(pending, sync=True)
    logging.info("updated %d plists, %d new" % (updated, created))
def load_object(self, obj_id, obj_store):
    """Fetch and decode the object stored under ``obj_id``.

    Args:
        obj_id: Id passed to ``numencode.encode_uint`` to form the key.
        obj_store: Store object providing ``Get(key)``.

    Returns:
        The result of ``self.str_to_obj`` applied to the decompressed blob.
        Anything raised by ``obj_store.Get`` propagates to the caller.
    """
    blob = obj_store.Get(numencode.encode_uint(obj_id))
    return self.str_to_obj(self.decompress(blob))
def load_posting_list(self, term_id, plist_store):
    """Fetch and decode the posting list stored under ``term_id``.

    Args:
        term_id: Id passed to ``numencode.encode_uint`` to form the key.
        plist_store: Store object providing ``Get(key)``.

    Returns:
        The result of ``self.decode_posting_list`` applied to the stored
        blob. Anything raised by ``plist_store.Get`` propagates to the
        caller.
    """
    blob = plist_store.Get(numencode.encode_uint(term_id))
    return self.decode_posting_list(blob)