class SqliteDictTerminateFailTest(unittest.TestCase):
    """Provide coverage for SqliteDict.terminate()."""

    def setUp(self):
        self.fname = norm_file('tests/db-permdenied/sqlitedict.sqlite')
        self.db = SqliteDict(filename=self.fname)
        os.chmod(self.fname, 0o000)
        os.chmod(os.path.dirname(self.fname), 0o000)

    def tearDown(self):
        os.chmod(os.path.dirname(self.fname), 0o700)
        os.chmod(self.fname, 0o600)
        os.unlink(self.fname)
        os.rmdir(os.path.dirname(self.fname))

    def test_terminate_cannot_delete(self):
        # exercise
        self.db.terminate()  # deletion failed, but no exception raised!

        # verify
        os.chmod(os.path.dirname(self.fname), 0o700)
        os.chmod(self.fname, 0o600)
        self.assertTrue(os.path.exists(self.fname))
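# A minimal standalone sketch (not part of the test suite) of the behaviour the
# test above pins down: SqliteDict.terminate() deletes its database file on a
# best-effort basis, logging a warning instead of raising when deletion fails.
# Only the public sqlitedict API is assumed; the temp-dir layout is illustrative.
def _demo_terminate_permission_denied():
    import os
    import tempfile
    from sqlitedict import SqliteDict

    tmpdir = tempfile.mkdtemp()
    fname = os.path.join(tmpdir, 'demo.sqlite')
    db = SqliteDict(filename=fname)
    os.chmod(fname, 0o000)
    os.chmod(tmpdir, 0o000)
    db.terminate()  # cannot delete the file, but raises no exception
    os.chmod(tmpdir, 0o700)
    os.chmod(fname, 0o600)
    assert os.path.exists(fname)  # the file survived the failed deletion
    os.unlink(fname)
    os.rmdir(tmpdir)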
class SimIndex(gensim.utils.SaveLoad):
    """
    An index of documents. Used internally by SimServer.

    It uses the Similarity class to persist all document vectors to disk (via mmap).
    """
    def __init__(self, fname, num_features, shardsize=SHARD_SIZE, topsims=TOP_SIMS):
        """
        Spill index shards to disk after every `shardsize` documents.
        In similarity queries, return only the `topsims` most similar documents.
        """
        self.fname = fname
        self.shardsize = int(shardsize)
        self.topsims = int(topsims)
        self.id2pos = {}  # map document id (string) to index position (integer)
        self.pos2id = {}  # reverse mapping for id2pos; redundant, for performance
        # precomputed top similar: document id -> [(doc_id, similarity)]
        self.id2sims = SqliteDict(self.fname + '.id2sims', journal_mode=JOURNAL_MODE)
        self.qindex = gensim.similarities.Similarity(
            self.fname + '.idx', corpus=None, num_best=None,
            num_features=num_features, shardsize=shardsize)
        self.length = 0

    def save(self, fname):
        tmp, self.id2sims = self.id2sims, None
        super(SimIndex, self).save(fname)
        self.id2sims = tmp

    @staticmethod
    def load(fname):
        result = gensim.utils.SaveLoad.load(fname)
        result.fname = fname
        result.check_moved()
        result.id2sims = SqliteDict(fname + '.id2sims', journal_mode=JOURNAL_MODE)
        return result

    def check_moved(self):
        output_prefix = self.fname + '.idx'
        if self.qindex.output_prefix != output_prefix:
            logger.info("index seems to have moved from %s to %s; updating locations" %
                        (self.qindex.output_prefix, output_prefix))
            self.qindex.output_prefix = output_prefix
            self.qindex.check_moved()

    def close(self):
        """Explicitly release important resources (file handles, db, ...)."""
        try:
            self.id2sims.close()
        except:
            pass
        try:
            del self.qindex
        except:
            pass

    def terminate(self):
        """Delete all files created by this index, invalidating `self`. Use with care."""
        try:
            self.id2sims.terminate()
        except:
            pass
        import glob
        for fname in glob.glob(self.fname + '*'):
            try:
                os.remove(fname)
                logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        # copy the keys first: deleting attributes while iterating over
        # __dict__ directly would raise RuntimeError in Python 3
        for val in list(self.__dict__.keys()):
            try:
                delattr(self, val)
            except:
                pass

    def index_documents(self, fresh_docs, model):
        """
        Update fresh index with new documents (potentially replacing old ones with
        the same id). `fresh_docs` is a dictionary-like object (=dict, sqlitedict,
        shelve etc.) that maps document_id->document.
        """
        docids = list(fresh_docs.keys())  # materialize, so len() works in Python 3
        vectors = model.docs2vecs(fresh_docs[docid] for docid in docids)
        logger.info("adding %i documents to %s" % (len(docids), self))
        self.qindex.add_documents(vectors)
        self.qindex.save()
        self.update_ids(docids)

    def update_ids(self, docids):
        """Update id->pos mapping with new document ids."""
        logger.info("updating %i id mappings" % len(docids))
        for docid in docids:
            if docid is not None:
                pos = self.id2pos.get(docid, None)
                if pos is not None:
                    logger.info("replacing existing document %r in %s" % (docid, self))
                    del self.pos2id[pos]
                self.id2pos[docid] = self.length
                try:
                    del self.id2sims[docid]
                except:
                    pass
            self.length += 1
        self.id2sims.sync()
        self.update_mappings()

    def update_mappings(self):
        """Synchronize id<->position mappings."""
        self.pos2id = dict((v, k) for k, v in self.id2pos.items())
        assert len(self.pos2id) == len(self.id2pos), "duplicate ids or positions detected"

    def delete(self, docids):
        """Delete documents (specified by their ids) from the index."""
        logger.debug("deleting %i documents from %s" % (len(docids), self))
        deleted = 0
        for docid in docids:
            try:
                del self.id2pos[docid]
                deleted += 1
                del self.id2sims[docid]
            except:
                pass
        self.id2sims.sync()
        if deleted:
            logger.info("deleted %i documents from %s" % (deleted, self))
        self.update_mappings()

    def sims2scores(self, sims, eps=1e-7):
        """Convert raw similarity vector to a list of (docid, similarity) results."""
        result = []
        if isinstance(sims, numpy.ndarray):
            sims = abs(sims)  # TODO or maybe clip? are opposite vectors "similar" or "dissimilar"?!
            for pos in numpy.argsort(sims)[::-1]:
                if pos in self.pos2id and sims[pos] > eps:  # ignore deleted/rewritten documents
                    # convert positions of resulting docs back to ids
                    result.append((self.pos2id[pos], sims[pos]))
                    if len(result) == self.topsims:
                        break
        else:
            for pos, score in sims:
                if pos in self.pos2id and abs(score) > eps:  # ignore deleted/rewritten documents
                    # convert positions of resulting docs back to ids
                    result.append((self.pos2id[pos], abs(score)))
                    if len(result) == self.topsims:
                        break
        return result

    def vec_by_id(self, docid):
        """Return indexed vector corresponding to document `docid`."""
        pos = self.id2pos[docid]
        return self.qindex.vector_by_id(pos)

    def sims_by_id(self, docid):
        """Find the most similar documents to the (already indexed) document with `docid`."""
        result = self.id2sims.get(docid, None)
        if result is None:
            self.qindex.num_best = self.topsims
            sims = self.qindex.similarity_by_id(self.id2pos[docid])
            result = self.sims2scores(sims)
        return result

    def sims_by_vec(self, vec, normalize=None):
        """
        Find the most similar documents to a given vector (=already processed document).
        """
        if normalize is None:
            normalize = self.qindex.norm
        norm, self.qindex.norm = self.qindex.norm, normalize  # store old value
        self.qindex.num_best = self.topsims
        sims = self.qindex[vec]
        self.qindex.norm = norm  # restore old value of qindex.norm
        return self.sims2scores(sims)

    def merge(self, other):
        """Merge documents from the other index. Update precomputed similarities
        in the process."""
        other.qindex.norm, other.qindex.num_best = False, self.topsims
        # update precomputed "most similar" for old documents (in case some of
        # the new docs make it to the top-N for some of the old documents)
        logger.info("updating old precomputed values")
        pos, lenself = 0, len(self.qindex)
        for chunk in self.qindex.iter_chunks():
            for sims in other.qindex[chunk]:
                if pos in self.pos2id:
                    # ignore masked entries (deleted, overwritten documents)
                    docid = self.pos2id[pos]
                    sims = self.sims2scores(sims)
                    self.id2sims[docid] = merge_sims(self.id2sims[docid], sims, self.topsims)
                pos += 1
                if pos % 10000 == 0:
                    logger.info("PROGRESS: updated doc #%i/%i" % (pos, lenself))
        self.id2sims.sync()

        logger.info("merging fresh index into optimized one")
        pos, docids = 0, []
        for chunk in other.qindex.iter_chunks():
            for vec in chunk:
                if pos in other.pos2id:  # don't copy deleted documents
                    self.qindex.add_documents([vec])
                    docids.append(other.pos2id[pos])
                pos += 1
        self.qindex.save()
        self.update_ids(docids)

        logger.info("precomputing most similar for the fresh index")
        pos, lenother = 0, len(other.qindex)
        norm, self.qindex.norm = self.qindex.norm, False
        topsims, self.qindex.num_best = self.qindex.num_best, self.topsims
        for chunk in other.qindex.iter_chunks():
            for sims in self.qindex[chunk]:
                if pos in other.pos2id:
                    # ignore masked entries (deleted, overwritten documents)
                    docid = other.pos2id[pos]
                    self.id2sims[docid] = self.sims2scores(sims)
                pos += 1
                if pos % 10000 == 0:
                    logger.info("PROGRESS: precomputed doc #%i/%i" % (pos, lenother))
        self.qindex.norm, self.qindex.num_best = norm, topsims
        self.id2sims.sync()

    def __len__(self):
        return len(self.id2pos)

    def __contains__(self, docid):
        return docid in self.id2pos

    def keys(self):
        return self.id2pos.keys()

    def __str__(self):
        return "SimIndex(%i docs, %i real size)" % (len(self), self.length)
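# Usage sketch for SimIndex on its own (normally SimServer drives it). The stub
# model below is an assumption for illustration only: index_documents() needs
# just a docs2vecs() method yielding one sparse (feature_id, weight) vector per
# document, which is what SimModel provides in the real pipeline.
def _demo_simindex(workdir='/tmp/simindex_demo'):
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    class StubModel(object):
        num_features = 2

        def docs2vecs(self, docs):
            for doc in docs:
                yield [(0, 1.0), (1, float(len(doc['tokens'])))]

    model = StubModel()
    docs = {
        'doc1': {'id': 'doc1', 'tokens': ['hello']},
        'doc2': {'id': 'doc2', 'tokens': ['hello', 'world']},
    }
    index = SimIndex(os.path.join(workdir, 'index'), model.num_features)
    index.index_documents(docs, model)  # add (or replace) documents
    print(index.sims_by_id('doc1'))     # [(docid, similarity), ...], top matches first
    index.terminate()                   # delete all index files, invalidating `index`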
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared across network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """
    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by
        its location. There is therefore no need to store the server object.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except:
            self.model = None
        self.payload = SqliteDict(self.location('payload'), autocommit=True,
                                  journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)

    def location(self, name):
        return os.path.join(self.basename, name)

    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    # erase all buffered documents + file on disk
                    self.fresh_docs.terminate()
                except:
                    pass
            # buffer defaults to a random location in temp
            self.fresh_docs = SqliteDict(journal_mode=JOURNAL_MODE)
        self.fresh_docs.sync()

    def close(self):
        """Explicitly close open file handles, databases etc."""
        try:
            self.payload.close()
        except:
            pass
        try:
            self.model.close()
        except:
            pass
        try:
            self.fresh_index.close()
        except:
            pass
        try:
            self.opt_index.close()
        except:
            pass
        try:
            self.fresh_docs.terminate()
        except:
            pass

    def __del__(self):
        """When the server went out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network: the
        entire `documents` must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
            # logger.debug("buffering document %r" % docid)
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" % docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()

    @gensim.utils.synchronous('lock_update')
    def train(self, corpus=None, method='auto', clear_buffer=True, params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                logger.warning("too few training documents; using simple log-entropy model instead of latent semantic indexing")
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'), self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed
        similarities) to an optimized index (precomputed similarities). Similarity
        queries always query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.
        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return  # nothing to do!
        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'), self.model.num_features)
        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate()  # delete old files
        self.fresh_index = None
        self.flush(save_index=True)

    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also drop the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr + "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload
        if self.payload is not None:
            self.payload.close()
            fname = self.location('payload')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        self.payload = SqliteDict(self.location('payload'), autocommit=True,
                                  journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            fname = self.location('model')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
            self.model = None
        self.flush(save_index=True, save_model=True, clear_buffer=True)

    @gensim.utils.synchronous('lock_update')
    def delete(self, docids):
        """Delete specified documents from the index."""
        logger.info("asked to drop %i documents" % len(docids))
        for index in [self.opt_index, self.fresh_index]:
            if index is not None:
                index.delete(docids)
        self.flush(save_index=True)

    def is_locked(self):
        # note: _count is an implementation detail of the pure-Python RLock
        return self.use_locks and self.lock_update._count > 0

    def vec_by_id(self, docid):
        for index in [self.opt_index, self.fresh_index]:
            if index is not None and docid in index:
                return index.vec_by_id(docid)

    def find_similar(self, doc, min_score=0.0, max_results=100):
        """
        Find `max_results` most similar articles in the index, each having
        similarity score of at least `min_score`. The resulting list may be
        shorter than `max_results`, in case there are not enough matching documents.

        `doc` is either a string (=document id, previously indexed) or a dict
        containing a 'tokens' key. These tokens are processed to produce a
        vector, which is then used as a query against the index.

        The similar documents are returned in decreasing similarity order, as
        `(doc_id, similarity_score, doc_payload)` 3-tuples. The payload returned
        is identical to what was supplied for this document during indexing.
        """
        logger.debug("received query call with %r" % doc)
        if self.is_locked():
            msg = "cannot query while the server is being updated"
            logger.error(msg)
            raise RuntimeError(msg)
        sims_opt, sims_fresh = None, None
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.topsims = max_results
        if isinstance(doc, str):
            # query by direct document id
            docid = doc
            if self.opt_index is not None and docid in self.opt_index:
                sims_opt = self.opt_index.sims_by_id(docid)
                if self.fresh_index is not None:
                    vec = self.opt_index.vec_by_id(docid)
                    sims_fresh = self.fresh_index.sims_by_vec(vec, normalize=False)
            elif self.fresh_index is not None and docid in self.fresh_index:
                sims_fresh = self.fresh_index.sims_by_id(docid)
                if self.opt_index is not None:
                    vec = self.fresh_index.vec_by_id(docid)
                    sims_opt = self.opt_index.sims_by_vec(vec, normalize=False)
            else:
                raise ValueError("document %r not in index" % docid)
        else:
            if 'topics' in doc:
                # user supplied vector directly => use that
                vec = gensim.matutils.any2sparse(doc['topics'])
            else:
                # query by an arbitrary text (=tokens) inside doc['tokens']
                vec = self.model.doc2vec(doc)  # convert document (text) to vector
            if self.opt_index is not None:
                sims_opt = self.opt_index.sims_by_vec(vec)
            if self.fresh_index is not None:
                sims_fresh = self.fresh_index.sims_by_vec(vec)

        merged = merge_sims(sims_opt, sims_fresh)
        logger.debug("got %s raw similars, pruning with max_results=%s, min_score=%s" %
                     (len(merged), max_results, min_score))
        result = []
        for docid, score in merged:
            if score < min_score or 0 < max_results <= len(result):
                break
            result.append((docid, float(score), self.payload.get(docid, None)))
        return result

    def find_dissimilar(self, doc, max_score=1.0, max_results=100):
        """
        Find `max_results` least similar articles in the index, each having
        similarity score of at most `max_score`. The resulting list may be
        shorter than `max_results`, in case there are not enough matching documents.

        `doc` is either a string (=document id, previously indexed) or a dict
        containing a 'tokens' key. These tokens are processed to produce a
        vector, which is then used as a query against the index.

        The dissimilar documents are returned in increasing similarity order, as
        `(doc_id, similarity_score, doc_payload)` 3-tuples. The payload returned
        is identical to what was supplied for this document during indexing.
        """
        logger.debug("received query call with %r" % doc)
        if self.is_locked():
            msg = "cannot query while the server is being updated"
            logger.error(msg)
            raise RuntimeError(msg)
        sims_opt, sims_fresh = None, None
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                # retrieve (practically) all similarities, so that the least
                # similar documents appear at the tail of the sorted list
                index.topsims = 10000000
        if isinstance(doc, str):
            # query by direct document id
            docid = doc
            if self.opt_index is not None and docid in self.opt_index:
                sims_opt = self.opt_index.sims_by_id(docid)
                if self.fresh_index is not None:
                    vec = self.opt_index.vec_by_id(docid)
                    sims_fresh = self.fresh_index.sims_by_vec(vec, normalize=False)
            elif self.fresh_index is not None and docid in self.fresh_index:
                sims_fresh = self.fresh_index.sims_by_id(docid)
                if self.opt_index is not None:
                    vec = self.fresh_index.vec_by_id(docid)
                    sims_opt = self.opt_index.sims_by_vec(vec, normalize=False)
            else:
                raise ValueError("document %r not in index" % docid)
        else:
            if 'topics' in doc:
                # user supplied vector directly => use that
                vec = gensim.matutils.any2sparse(doc['topics'])
            else:
                # query by an arbitrary text (=tokens) inside doc['tokens']
                vec = self.model.doc2vec(doc)  # convert document (text) to vector
            if self.opt_index is not None:
                sims_opt = self.opt_index.sims_by_vec(vec)
            if self.fresh_index is not None:
                sims_fresh = self.fresh_index.sims_by_vec(vec)

        merged = merge_sims(sims_opt, sims_fresh)
        merged.sort(key=lambda tup: tup[1], reverse=False)  # least similar first
        logger.debug("got %s raw similars, pruning with max_results=%s, max_score=%s" %
                     (len(merged), max_results, max_score))
        result = []
        for docid, score in merged:
            if score > max_score:
                if len(result) >= max_results:
                    break
            elif len(result) >= max_results:
                break
            result.append((docid, float(score), self.payload.get(docid, None)))
        return result

    def __str__(self):
        return ("SimServer(loc=%r, fresh=%s, opt=%s, model=%s, buffer=%s)" %
                (self.basename, self.fresh_index, self.opt_index, self.model, self.fresh_docs))

    def __len__(self):
        return sum(len(index) for index in [self.opt_index, self.fresh_index]
                   if index is not None)

    def __contains__(self, docid):
        """Is document with `docid` in the index?"""
        return any(index is not None and docid in index
                   for index in [self.opt_index, self.fresh_index])

    def get_tfidf(self, *args, **kwargs):
        return self.model.get_tfidf(*args, **kwargs)

    def status(self):
        return str(self)

    def keys(self):
        """Return ids of all indexed documents."""
        result = []
        if self.fresh_index is not None:
            result += self.fresh_index.keys()
        if self.opt_index is not None:
            result += self.opt_index.keys()
        return result

    def memdebug(self):
        from guppy import hpy
        return str(hpy().heap())