Example #1
class SqliteDictTerminateFailTest(unittest.TestCase):
    """Provide Coverage for SqliteDict.terminate()."""
    def setUp(self):
        self.fname = norm_file('tests/db-permdenied/sqlitedict.sqlite')
        self.db = SqliteDict(filename=self.fname)
        os.chmod(self.fname, 0o000)
        os.chmod(os.path.dirname(self.fname), 0o000)

    def tearDown(self):
        os.chmod(os.path.dirname(self.fname), 0o700)
        os.chmod(self.fname, 0o600)
        os.unlink(self.fname)
        os.rmdir(os.path.dirname(self.fname))

    def test_terminate_cannot_delete(self):
        # exercise,
        self.db.terminate()  # deletion failed, but no exception raised!

        # verify,
        os.chmod(os.path.dirname(self.fname), 0o700)
        os.chmod(self.fname, 0o600)
        self.assertTrue(os.path.exists(self.fname))
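
Before the next example, a minimal happy-path sketch of what the test above exercises: under normal permissions, SqliteDict.terminate() closes the store and deletes its backing file (the path below is hypothetical).

import os
from sqlitedict import SqliteDict

db = SqliteDict('/tmp/example.sqlite', autocommit=True)  # hypothetical path
db['key'] = {'any': 'picklable value'}  # values are pickled transparently
assert db['key'] == {'any': 'picklable value'}
db.terminate()  # close the connection and delete the .sqlite file
assert not os.path.exists('/tmp/example.sqlite')
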
Example #3
class SimIndex(gensim.utils.SaveLoad):
    """
    An index of documents. Used internally by SimServer.

    It uses the Similarity class to persist all document vectors to disk (via mmap).
    """
    def __init__(self, fname, num_features, shardsize=SHARD_SIZE, topsims=TOP_SIMS):
        """
        Spill index shards to disk after every `shardsize` documents.
        In similarity queries, return only the `topsims` most similar documents.
        """
        self.fname = fname
        self.shardsize = int(shardsize)
        self.topsims = int(topsims)
        self.id2pos = {} # map document id (string) to index position (integer)
        self.pos2id = {} # reverse mapping for id2pos; redundant, for performance
        self.id2sims = SqliteDict(self.fname + '.id2sims', journal_mode=JOURNAL_MODE) # precomputed top similar: document id -> [(doc_id, similarity)]
        self.qindex = gensim.similarities.Similarity(self.fname + '.idx', corpus=None,
            num_best=None, num_features=num_features, shardsize=shardsize)
        self.length = 0

    def save(self, fname):
        tmp, self.id2sims = self.id2sims, None
        super(SimIndex, self).save(fname)
        self.id2sims = tmp


    @staticmethod
    def load(fname):
        result = gensim.utils.SaveLoad.load(fname)
        result.fname = fname
        result.check_moved()
        result.id2sims = SqliteDict(fname + '.id2sims', journal_mode=JOURNAL_MODE)
        return result


    def check_moved(self):
        output_prefix = self.fname + '.idx'
        if self.qindex.output_prefix != output_prefix:
            logger.info("index seems to have moved from %s to %s; updating locations" %
                (self.qindex.output_prefix, output_prefix))
            self.qindex.output_prefix = output_prefix
            self.qindex.check_moved()


    def close(self):
        "Explicitly release important resources (file handles, db, ...)"
        try:
            self.id2sims.close()
        except:
            pass
        try:
            del self.qindex
        except:
            pass


    def terminate(self):
        """Delete all files created by this index, invalidating `self`. Use with care."""
        try:
            self.id2sims.terminate()
        except:
            pass
        import glob
        for fname in glob.glob(self.fname + '*'):
            try:
                os.remove(fname)
                logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        for val in self.__dict__.keys():
            try:
                delattr(self, val)
            except:
                pass
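
The save()/load() pair in SimIndex above follows a common pattern: the SqliteDict handle is not picklable, so it is detached before serialization and re-opened after loading. A minimal, self-contained sketch of that pattern with plain pickle and a hypothetical class (no gensim involved):

import pickle

class Store:
    """Toy stand-in for SimIndex: `handle` plays the role of the SqliteDict."""
    def __init__(self, path):
        self.path = path
        self.handle = open(path, 'a+')  # unpicklable resource

    def save(self, fname):
        handle, self.handle = self.handle, None  # detach unpicklable state
        try:
            with open(fname, 'wb') as out:
                pickle.dump(self, out)
        finally:
            self.handle = handle  # restore it on the live object

    @staticmethod
    def load(fname):
        with open(fname, 'rb') as inp:
            result = pickle.load(inp)
        result.handle = open(result.path, 'a+')  # re-open after unpickling
        return result
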
Example #4
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared over the network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """
    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by its location.
        There is therefore no need to store the server object.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except:
            self.model = None
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)


    def location(self, name):
        return os.path.join(self.basename, name)


    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    self.fresh_docs.terminate() # erase all buffered documents + file on disk
                except:
                    pass
            self.fresh_docs = SqliteDict(journal_mode=JOURNAL_MODE) # buffer defaults to a random location in temp
        self.fresh_docs.sync()


    def close(self):
        """Explicitly close open file handles, databases etc."""
        try:
            self.payload.close()
        except:
            pass
        try:
            self.model.close()
        except:
            pass
        try:
            self.fresh_index.close()
        except:
            pass
        try:
            self.opt_index.close()
        except:
            pass
        try:
            self.fresh_docs.terminate()
        except:
            pass

    def __del__(self):
        """When the server goes out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network: the
        entire `documents` must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
#            logger.debug("buffering document %r" % docid)
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" % docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()


    @gensim.utils.synchronous('lock_update')
    def train(self, corpus=None, method='auto', clear_buffer=True, params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                logging.warning("too few training documents; using simple log-entropy model instead of latent semantic indexing")
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)


    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)

        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)

        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)

        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'), self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)


    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed similarities)
        to an optimized index (precomputed similarities). Similarity queries always
        query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.

        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return # nothing to do!

        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'), self.model.num_features)

        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate() # delete old files
        self.fresh_index = None
        self.flush(save_index=True)


    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also drop the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr + "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload
        if self.payload is not None:
            self.payload.close()

            fname = self.location('payload')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s" % fname)
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            fname = self.location('model')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s" % fname)
            self.model = None
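
A hedged usage sketch for the SimServer class above: documents are plain dicts with an 'id', a 'tokens' list and an optional 'payload' (inferred from buffer() and index()); the directory name is hypothetical.

import os

os.makedirs('/tmp/simserver-data', exist_ok=True)  # hypothetical location; must be a directory
server = SimServer('/tmp/simserver-data')

corpus = [
    {'id': 'doc1', 'tokens': ['sqlite', 'persistent', 'dict'], 'payload': 'first doc'},
    {'id': 'doc2', 'tokens': ['similarity', 'index', 'server'], 'payload': 'second doc'},
]
server.train(corpus, method='logentropy')  # build the semantic model
server.index(corpus)                       # index the documents with that model
server.drop_index(keep_model=True)         # drop indexed documents, keep the model
server.close()
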
class SimIndex(gensim.utils.SaveLoad):
    """
    An index of documents. Used internally by SimServer.

    It uses the Similarity class to persist all document vectors to disk (via mmap).
    """
    def __init__(self,
                 fname,
                 num_features,
                 shardsize=SHARD_SIZE,
                 topsims=TOP_SIMS):
        """
        Spill index shards to disk after every `shardsize` documents.
        In similarity queries, return only the `topsims` most similar documents.
        """
        self.fname = fname
        self.shardsize = int(shardsize)
        self.topsims = int(topsims)
        self.id2pos = {
        }  # map document id (string) to index position (integer)
        self.pos2id = {
        }  # reverse mapping for id2pos; redundant, for performance
        self.id2sims = SqliteDict(
            self.fname + '.id2sims', journal_mode=JOURNAL_MODE
        )  # precomputed top similar: document id -> [(doc_id, similarity)]
        self.qindex = gensim.similarities.Similarity(self.fname + '.idx',
                                                     corpus=None,
                                                     num_best=None,
                                                     num_features=num_features,
                                                     shardsize=shardsize)
        self.length = 0

    def save(self, fname):
        tmp, self.id2sims = self.id2sims, None
        super(SimIndex, self).save(fname)
        self.id2sims = tmp

    @staticmethod
    def load(fname):
        result = gensim.utils.SaveLoad.load(fname)
        result.fname = fname
        result.check_moved()
        result.id2sims = SqliteDict(fname + '.id2sims',
                                    journal_mode=JOURNAL_MODE)
        return result

    def check_moved(self):
        output_prefix = self.fname + '.idx'
        if self.qindex.output_prefix != output_prefix:
            logger.info(
                "index seems to have moved from %s to %s; updating locations" %
                (self.qindex.output_prefix, output_prefix))
            self.qindex.output_prefix = output_prefix
            self.qindex.check_moved()

    def close(self):
        "Explicitly release important resources (file handles, db, ...)"
        try:
            self.id2sims.close()
        except:
            pass
        try:
            del self.qindex
        except:
            pass

    def terminate(self):
        """Delete all files created by this index, invalidating `self`. Use with care."""
        try:
            self.id2sims.terminate()
        except:
            pass
        import glob
        for fname in glob.glob(self.fname + '*'):
            try:
                os.remove(fname)
                logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        for val in self.__dict__.keys():
            try:
                delattr(self, val)
            except:
                pass
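
close() and terminate() above deliberately swallow every error with bare `except: pass` blocks so that cleanup never raises. A stylistic sketch of the same intent using contextlib.suppress (an alternative spelling, not a change to the library):

import contextlib
import tempfile

class Resources:
    def __init__(self):
        self.db = tempfile.TemporaryFile()  # stands in for the SqliteDict handle

    def close(self):
        """Release resources; never raise, even if they are already gone."""
        with contextlib.suppress(Exception):  # same intent as try/except: pass
            self.db.close()
        with contextlib.suppress(Exception):
            del self.db
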
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared over the network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """
    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by its location.
        There is therefore no need to store the server object.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock(
        ) if use_locks else gensim.utils.nocm
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except:
            self.model = None
        self.payload = SqliteDict(self.location('payload'),
                                  autocommit=True,
                                  journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)

    def location(self, name):
        return os.path.join(self.basename, name)

    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    self.fresh_docs.terminate(
                    )  # erase all buffered documents + file on disk
                except:
                    pass
            self.fresh_docs = SqliteDict(
                journal_mode=JOURNAL_MODE
            )  # buffer defaults to a random location in temp
        self.fresh_docs.sync()

    def close(self):
        """Explicitly close open file handles, databases etc."""
        try:
            self.payload.close()
        except:
            pass
        try:
            self.model.close()
        except:
            pass
        try:
            self.fresh_index.close()
        except:
            pass
        try:
            self.opt_index.close()
        except:
            pass
        try:
            self.fresh_docs.terminate()
        except:
            pass

    def __del__(self):
        """When the server goes out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network: the
        entire `documents` must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
            #            logger.debug("buffering document %r" % docid)
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" %
                               docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()

    @gensim.utils.synchronous('lock_update')
    def train(self,
              corpus=None,
              method='auto',
              clear_buffer=True,
              params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                logging.warning(
                    "too few training documents; using simple log-entropy model instead of latent semantic indexing"
                )
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)

        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)

        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)

        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'),
                                        self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed similarities)
        to an optimized index (precomputed similarities). Similarity queries always
        query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.

        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return  # nothing to do!

        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'),
                                      self.model.num_features)

        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate()  # delete old files
        self.fresh_index = None
        self.flush(save_index=True)

    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also drop the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr +
                    "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload
        if self.payload is not None:
            self.payload.close()

            fname = self.location('payload')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s" % fname)
        self.payload = SqliteDict(self.location('payload'),
                                  autocommit=True,
                                  journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            fname = self.location('model')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s" % fname)
            self.model = None
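
The @gensim.utils.synchronous('lock_update') decorator used throughout SimServer serializes updates by acquiring the lock stored on the instance. An illustrative reimplementation of such a decorator (a sketch of the pattern, not gensim's actual code):

import functools
import threading

def synchronous(lock_attr):
    """Acquire the lock found at `self.<lock_attr>` around each call."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            with getattr(self, lock_attr):  # threading.RLock or a no-op context manager
                return func(self, *args, **kwargs)
        return wrapper
    return decorator

class Counter:
    def __init__(self):
        self.lock_update = threading.RLock()
        self.value = 0

    @synchronous('lock_update')
    def bump(self):
        self.value += 1  # executed with lock_update held
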
class SimIndex(gensim.utils.SaveLoad):
    """
    An index of documents. Used internally by SimServer.

    It uses the Similarity class to persist all document vectors to disk (via mmap).
    """
    def __init__(self,
                 fname,
                 num_features,
                 shardsize=SHARD_SIZE,
                 topsims=TOP_SIMS):
        """
        Spill index shards to disk after every `shardsize` documents.
        In similarity queries, return only the `topsims` most similar documents.
        """
        self.fname = fname
        self.shardsize = int(shardsize)
        self.topsims = int(topsims)
        self.id2pos = {
        }  # map document id (string) to index position (integer)
        self.pos2id = {
        }  # reverse mapping for id2pos; redundant, for performance
        self.id2sims = SqliteDict(
            self.fname + '.id2sims', journal_mode=JOURNAL_MODE
        )  # precomputed top similar: document id -> [(doc_id, similarity)]
        self.qindex = gensim.similarities.Similarity(self.fname + '.idx',
                                                     corpus=None,
                                                     num_best=None,
                                                     num_features=num_features,
                                                     shardsize=shardsize)
        self.length = 0

    def save(self, fname):
        tmp, self.id2sims = self.id2sims, None
        super(SimIndex, self).save(fname)
        self.id2sims = tmp

    @staticmethod
    def load(fname):
        result = gensim.utils.SaveLoad.load(fname)
        result.fname = fname
        result.check_moved()
        result.id2sims = SqliteDict(fname + '.id2sims',
                                    journal_mode=JOURNAL_MODE)
        return result

    def check_moved(self):
        output_prefix = self.fname + '.idx'
        if self.qindex.output_prefix != output_prefix:
            logger.info(
                "index seems to have moved from %s to %s; updating locations" %
                (self.qindex.output_prefix, output_prefix))
            self.qindex.output_prefix = output_prefix
            self.qindex.check_moved()

    def close(self):
        "Explicitly release important resources (file handles, db, ...)"
        try:
            self.id2sims.close()
        except:
            pass
        try:
            del self.qindex
        except:
            pass

    def terminate(self):
        """Delete all files created by this index, invalidating `self`. Use with care."""
        try:
            self.id2sims.terminate()
        except:
            pass
        import glob
        for fname in glob.glob(self.fname + '*'):
            try:
                os.remove(fname)
                logger.info("deleted %s" % fname)
            #except Exception, e:
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        for val in self.__dict__.keys():
            try:
                delattr(self, val)
            except:
                pass

    def index_documents(self, fresh_docs, model):
        """
        Update fresh index with new documents (potentially replacing old ones with
        the same id). `fresh_docs` is a dictionary-like object (=dict, sqlitedict, shelve etc)
        that maps document_id->document.
        """
        docids = fresh_docs.keys()
        vectors = (model.docs2vecs(fresh_docs[docid] for docid in docids))
        logger.info("adding %i documents to %s" % (len(docids), self))
        self.qindex.add_documents(vectors)
        self.qindex.save()
        self.update_ids(docids)

    def update_ids(self, docids):
        """Update id->pos mapping with new document ids."""
        logger.info("updating %i id mappings" % len(docids))
        for docid in docids:
            if docid is not None:
                pos = self.id2pos.get(docid, None)
                if pos is not None:
                    logger.info("replacing existing document %r in %s" %
                                (docid, self))
                    del self.pos2id[pos]
                self.id2pos[docid] = self.length
                try:
                    del self.id2sims[docid]
                except:
                    pass
            self.length += 1
        self.id2sims.sync()
        self.update_mappings()

    def update_mappings(self):
        """Synchronize id<->position mappings."""
        #self.pos2id = dict((v, k) for k, v in self.id2pos.iteritems())
        self.pos2id = dict((v, k) for k, v in self.id2pos.items())
        assert len(self.pos2id) == len(
            self.id2pos), "duplicate ids or positions detected"

    def delete(self, docids):
        """Delete documents (specified by their ids) from the index."""
        logger.debug("deleting %i documents from %s" % (len(docids), self))
        deleted = 0
        for docid in docids:
            try:
                del self.id2pos[docid]
                deleted += 1
                del self.id2sims[docid]
            except:
                pass
        self.id2sims.sync()
        if deleted:
            logger.info("deleted %i documents from %s" % (deleted, self))
        self.update_mappings()

    def sims2scores(self, sims, eps=1e-7):
        """Convert raw similarity vector to a list of (docid, similarity) results."""
        result = []
        if isinstance(sims, numpy.ndarray):
            sims = abs(
                sims
            )  # TODO or maybe clip? are opposite vectors "similar" or "dissimilar"?!
            for pos in numpy.argsort(sims)[::-1]:
                if pos in self.pos2id and sims[
                        pos] > eps:  # ignore deleted/rewritten documents
                    # convert positions of resulting docs back to ids
                    result.append((self.pos2id[pos], sims[pos]))
                    if len(result) == self.topsims:
                        break
        else:
            for pos, score in sims:
                if pos in self.pos2id and abs(
                        score) > eps:  # ignore deleted/rewritten documents
                    # convert positions of resulting docs back to ids
                    result.append((self.pos2id[pos], abs(score)))
                    if len(result) == self.topsims:
                        break
        return result

    def vec_by_id(self, docid):
        """Return indexed vector corresponding to document `docid`."""
        pos = self.id2pos[docid]
        return self.qindex.vector_by_id(pos)

    def sims_by_id(self, docid):
        """Find the most similar documents to the (already indexed) document with `docid`."""
        result = self.id2sims.get(docid, None)
        if result is None:
            self.qindex.num_best = self.topsims
            sims = self.qindex.similarity_by_id(self.id2pos[docid])
            result = self.sims2scores(sims)
        return result

    def sims_by_vec(self, vec, normalize=None):
        """
        Find the most similar documents to a given vector (=already processed document).
        """
        if normalize is None:

            #normalize = self.qindex.normalize
            normalize = self.qindex.norm

        #norm, self.qindex.normalize = self.qindex.normalize, normalize # store old value
        norm, self.qindex.norm = self.qindex.norm, normalize  # store old value

        self.qindex.num_best = self.topsims
        sims = self.qindex[vec]

        #self.qindex.normalize = norm # restore old value of qindex.normalize
        self.qindex.norm = norm  # restore old value of qindex.norm

        return self.sims2scores(sims)

    def merge(self, other):
        """Merge documents from the other index. Update precomputed similarities
        in the process."""

        #other.qindex.normalize, other.qindex.num_best = False, self.topsims
        other.qindex.norm, other.qindex.num_best = False, self.topsims

        # update precomputed "most similar" for old documents (in case some of
        # the new docs make it to the top-N for some of the old documents)
        logger.info("updating old precomputed values")
        pos, lenself = 0, len(self.qindex)
        for chunk in self.qindex.iter_chunks():
            for sims in other.qindex[chunk]:
                if pos in self.pos2id:
                    # ignore masked entries (deleted, overwritten documents)
                    docid = self.pos2id[pos]
                    sims = self.sims2scores(sims)
                    self.id2sims[docid] = merge_sims(self.id2sims[docid], sims,
                                                     self.topsims)
                pos += 1
                if pos % 10000 == 0:
                    logger.info("PROGRESS: updated doc #%i/%i" %
                                (pos, lenself))
        self.id2sims.sync()

        logger.info("merging fresh index into optimized one")
        pos, docids = 0, []
        for chunk in other.qindex.iter_chunks():
            for vec in chunk:
                if pos in other.pos2id:  # don't copy deleted documents
                    self.qindex.add_documents([vec])
                    docids.append(other.pos2id[pos])
                pos += 1
        self.qindex.save()
        self.update_ids(docids)

        logger.info("precomputing most similar for the fresh index")
        pos, lenother = 0, len(other.qindex)

        #norm, self.qindex.normalize = self.qindex.normalize, False
        norm, self.qindex.norm = self.qindex.norm, False

        topsims, self.qindex.num_best = self.qindex.num_best, self.topsims
        for chunk in other.qindex.iter_chunks():
            for sims in self.qindex[chunk]:
                if pos in other.pos2id:
                    # ignore masked entries (deleted, overwritten documents)
                    docid = other.pos2id[pos]
                    self.id2sims[docid] = self.sims2scores(sims)
                pos += 1
                if pos % 10000 == 0:
                    logger.info("PROGRESS: precomputed doc #%i/%i" %
                                (pos, lenother))

        #self.qindex.normalize, self.qindex.num_best = norm, topsims
        self.qindex.norm, self.qindex.num_best = norm, topsims

        self.id2sims.sync()

    def __len__(self):
        return len(self.id2pos)

    def __contains__(self, docid):
        return docid in self.id2pos

    def keys(self):
        return self.id2pos.keys()

    def __str__(self):
        return "SimIndex(%i docs, %i real size)" % (len(self), self.length)
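
merge_sims() is called in merge() above (and in the SimServer queries below) but defined elsewhere in simserver; conceptually it merges two (doc_id, similarity) lists and keeps the best-scoring entries. A minimal sketch of what such a helper could look like (an assumption about its behaviour, not the library's implementation):

def merge_sims(oldsims, newsims, clip=None):
    """Merge two lists of (doc_id, similarity) tuples, highest similarity first."""
    merged = sorted((oldsims or []) + (newsims or []), key=lambda item: item[1], reverse=True)
    return merged if clip is None else merged[:clip]
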
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared over the network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """
    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by its location.
        There is therefore no need to store the server object.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks

        self.lock_update = threading.RLock(
        ) if use_locks else gensim.utils.nocm
        #self.lock_update = threading._RLock if use_locks else gensim.utils.nocm
        #self.lock_update = RLock() if use_locks else gensim.utils.nocm

        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except:
            self.model = None
        self.payload = SqliteDict(self.location('payload'),
                                  autocommit=True,
                                  journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)

    def location(self, name):
        return os.path.join(self.basename, name)

    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    self.fresh_docs.terminate(
                    )  # erase all buffered documents + file on disk
                except:
                    pass
            self.fresh_docs = SqliteDict(
                journal_mode=JOURNAL_MODE
            )  # buffer defaults to a random location in temp
        self.fresh_docs.sync()

    def close(self):
        """Explicitly close open file handles, databases etc."""
        try:
            self.payload.close()
        except:
            pass
        try:
            self.model.close()
        except:
            pass
        try:
            self.fresh_index.close()
        except:
            pass
        try:
            self.opt_index.close()
        except:
            pass
        try:
            self.fresh_docs.terminate()
        except:
            pass

    def __del__(self):
        """When the server goes out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network: the
        entire `documents` must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
            #            logger.debug("buffering document %r" % docid)
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" %
                               docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()

    @gensim.utils.synchronous('lock_update')
    def train(self,
              corpus=None,
              method='auto',
              clear_buffer=True,
              params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                logging.warning(
                    "too few training documents; using simple log-entropy model instead of latent semantic indexing"
                )
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)

        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)

        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)

        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'),
                                        self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed similarities)
        to an optimized index (precomputed similarities). Similarity queries always
        query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.

        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return  # nothing to do!

        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'),
                                      self.model.num_features)

        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate()  # delete old files
        self.fresh_index = None
        self.flush(save_index=True)

    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also drop the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr +
                    "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload
        if self.payload is not None:
            self.payload.close()

            fname = self.location('payload')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            #except Exception, e:
            except Exception as e:
                logger.warning("failed to delete %s" % fname)
        self.payload = SqliteDict(self.location('payload'),
                                  autocommit=True,
                                  journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            fname = self.location('model')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            #except Exception, e:
            except Exception as e:
                logger.warning("failed to delete %s" % fname)
            self.model = None
        self.flush(save_index=True, save_model=True, clear_buffer=True)

    @gensim.utils.synchronous('lock_update')
    def delete(self, docids):
        """Delete specified documents from the index."""
        logger.info("asked to drop %i documents" % len(docids))
        for index in [self.opt_index, self.fresh_index]:
            if index is not None:
                index.delete(docids)
        self.flush(save_index=True)

    def is_locked(self):
        #return self.use_locks and self.lock_update._RLock__count > 0
        #return self.use_locks
        #return self.use_locks and self.lock_update._RLock._count > 0
        return self.use_locks and self.lock_update._count > 0

    def vec_by_id(self, docid):
        for index in [self.opt_index, self.fresh_index]:
            if index is not None and docid in index:
                return index.vec_by_id(docid)

    def find_similar(self, doc, min_score=0.0, max_results=100):
        """
        Find `max_results` most similar articles in the index, each having similarity
        score of at least `min_score`. The resulting list may be shorter than `max_results`,
        in case there are not enough matching documents.

        `doc` is either a string (=document id, previously indexed) or a
        dict containing a 'tokens' key. These tokens are processed to produce a
        vector, which is then used as a query against the index.

        The similar documents are returned in decreasing similarity order, as
        `(doc_id, similarity_score, doc_payload)` 3-tuples. The payload returned
        is identical to what was supplied for this document during indexing.

        """
        logger.debug("received query call with %r" % doc)
        if self.is_locked():
            msg = "cannot query while the server is being updated"
            logger.error(msg)
            raise RuntimeError(msg)
        sims_opt, sims_fresh = None, None
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.topsims = max_results
        #if isinstance(doc, basestring):
        if isinstance(doc, str):
            # query by direct document id
            docid = doc
            if self.opt_index is not None and docid in self.opt_index:
                sims_opt = self.opt_index.sims_by_id(docid)
                if self.fresh_index is not None:
                    vec = self.opt_index.vec_by_id(docid)
                    sims_fresh = self.fresh_index.sims_by_vec(vec,
                                                              normalize=False)
            elif self.fresh_index is not None and docid in self.fresh_index:
                sims_fresh = self.fresh_index.sims_by_id(docid)
                if self.opt_index is not None:
                    vec = self.fresh_index.vec_by_id(docid)
                    sims_opt = self.opt_index.sims_by_vec(vec, normalize=False)
            else:
                raise ValueError("document %r not in index" % docid)
        else:
            if 'topics' in doc:
                # user supplied vector directly => use that
                vec = gensim.matutils.any2sparse(doc['topics'])
            else:
                # query by an arbitrary text (=tokens) inside doc['tokens']
                vec = self.model.doc2vec(
                    doc)  # convert document (text) to vector
            if self.opt_index is not None:
                sims_opt = self.opt_index.sims_by_vec(vec)
            if self.fresh_index is not None:
                sims_fresh = self.fresh_index.sims_by_vec(vec)

        merged = merge_sims(sims_opt, sims_fresh)
        logger.debug(
            "got %s raw similars, pruning with max_results=%s, min_score=%s" %
            (len(merged), max_results, min_score))
        result = []
        for docid, score in merged:
            if score < min_score or 0 < max_results <= len(result):
                break
            result.append((docid, float(score), self.payload.get(docid, None)))
        return result

    #def find_similar(self, doc, min_score=0.0, max_results=100):
    def find_dissimilar(self, doc, max_score=1.0, max_results=100):
        """
        Find `max_results` least similar articles in the index, each having similarity
        score of at most `max_score`. The resulting list may be shorter than `max_results`,
        in case there are not enough matching documents.

        `doc` is either a string (=document id, previously indexed) or a
        dict containing a 'tokens' key. These tokens are processed to produce a
        vector, which is then used as a query against the index.

        The documents are returned in increasing similarity order, as
        `(doc_id, similarity_score, doc_payload)` 3-tuples. The payload returned
        is identical to what was supplied for this document during indexing.

        """
        logger.debug("received query call with %r" % doc)
        if self.is_locked():
            msg = "cannot query while the server is being updated"
            logger.error(msg)
            raise RuntimeError(msg)
        sims_opt, sims_fresh = None, None
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:

                #index.topsims = max_results
                index.topsims = 10000000

        #if isinstance(doc, basestring):
        if isinstance(doc, str):
            # query by direct document id
            docid = doc
            if self.opt_index is not None and docid in self.opt_index:
                sims_opt = self.opt_index.sims_by_id(docid)
                if self.fresh_index is not None:
                    vec = self.opt_index.vec_by_id(docid)
                    sims_fresh = self.fresh_index.sims_by_vec(vec,
                                                              normalize=False)
            elif self.fresh_index is not None and docid in self.fresh_index:
                sims_fresh = self.fresh_index.sims_by_id(docid)
                if self.opt_index is not None:
                    vec = self.fresh_index.vec_by_id(docid)
                    sims_opt = self.opt_index.sims_by_vec(vec, normalize=False)
            else:
                raise ValueError("document %r not in index" % docid)
        else:
            if 'topics' in doc:
                # user supplied vector directly => use that
                vec = gensim.matutils.any2sparse(doc['topics'])
            else:
                # query by an arbitrary text (=tokens) inside doc['tokens']
                vec = self.model.doc2vec(
                    doc)  # convert document (text) to vector
            if self.opt_index is not None:
                sims_opt = self.opt_index.sims_by_vec(vec)
            if self.fresh_index is not None:
                sims_fresh = self.fresh_index.sims_by_vec(vec)

        merged = merge_sims(sims_opt, sims_fresh)
        #merged.sort(reverse=False)
        merged.sort(key=lambda tup: tup[1], reverse=False)

        #logger.debug("got %s raw similars, pruning with max_results=%s, min_score=%s" %
        #    (len(merged), max_results, min_score))
        logger.debug(
            "got %s raw similars, pruning with max_results=%s, max_score=%s" %
            (len(merged), max_results, max_score))
        result = []
        #print("merged = ", merged)
        #print("len(merged) = ", len(merged))
        for docid, score in merged:
            #if score < min_score or 0 < max_results <= len(result):
            #if score > max_score or 0 < max_results <= len(result):
            if score > max_score:
                if len(result) >= max_results:
                    break
            #elif len(result) >= (10 * max_results):
            #elif len(result) >= (2 * max_results):
            elif len(result) >= (1 * max_results):
                break
            result.append((docid, float(score), self.payload.get(docid, None)))
        return result

    def __str__(self):
        return ("SimServer(loc=%r, fresh=%s, opt=%s, model=%s, buffer=%s)" %
                (self.basename, self.fresh_index, self.opt_index, self.model,
                 self.fresh_docs))

    def __len__(self):
        return sum(
            len(index) for index in [self.opt_index, self.fresh_index]
            if index is not None)

    def __contains__(self, docid):
        """Is document with `docid` in the index?"""
        return any(index is not None and docid in index
                   for index in [self.opt_index, self.fresh_index])

    def get_tfidf(self, *args, **kwargs):
        return self.model.get_tfidf(*args, **kwargs)

    def status(self):
        return str(self)

    def keys(self):
        """Return ids of all indexed documents."""
        result = []
        if self.fresh_index is not None:
            result += self.fresh_index.keys()
        if self.opt_index is not None:
            result += self.opt_index.keys()
        return result

    def memdebug(self):
        from guppy import hpy
        return str(hpy().heap())
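
Finally, a hedged end-to-end sketch for the full server above: index a few documents, then query by document id and by free text (directory name and document contents are hypothetical).

import os

os.makedirs('/tmp/simserver-full', exist_ok=True)  # hypothetical location
server = SimServer('/tmp/simserver-full')

texts = ['sqlite persistent dictionary',
         'document similarity server',
         'semantic indexing of documents']
corpus = [{'id': 'doc_%i' % i, 'tokens': text.split(), 'payload': text}
          for i, text in enumerate(texts)]

server.train(corpus)   # with < 1000 documents, 'auto' falls back to log-entropy
server.index(corpus)
server.optimize()      # precompute top similarities for queries by id

print(server.find_similar('doc_0', min_score=0.0, max_results=5))
print(server.find_similar({'tokens': ['document', 'similarity']}))
server.close()
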