Example no. 1
    def __init__(self,
                 output_prefix,
                 corpus,
                 num_features,
                 num_best=None,
                 chunksize=256,
                 shardsize=32768,
                 norm='l2'):
        """Build a sharded similarity index over `corpus`.

        The index can be grown later through `add_documents`. Internally,
        documents are split (transparently) into shards of `shardsize`
        documents each and converted to a matrix for faster BLAS calls. Each
        shard is stored on disk under ``output_prefix.shard_number``, so write
        access to that location is required. If no `output_prefix` is given,
        a random filename in temp is used instead.

        Choose `shardsize` so that a `shardsize x chunksize` matrix of floats
        fits comfortably into main memory.

        `num_features` is the number of features in `corpus` (e.g. dictionary
        size, or the number of latent topics for latent semantic models).

        `norm` is the user-chosen normalization; accepted values are 'l1'
        and 'l2'.

        With `num_best` unset, similarity queries return a full vector with
        one float per indexed document:

        >>> index = Similarity('/path/to/index', corpus, num_features=400) # if corpus has 7 documents...
        >>> index[query] # ... then result will have 7 floats
        [0.0, 0.0, 0.2, 0.13, 0.8, 0.0, 0.1]

        With `num_best` set, queries return at most `num_best`
        `(index_of_document, similarity)` tuples, always leaving out documents
        whose similarity is 0; an all-zero (empty sparse) query therefore
        yields an empty list:

        >>> index.num_best = 3
        >>> index[query]
        [(4, 0.8), (2, 0.13), (3, 0.13)]

        `num_best` can also be overridden dynamically, e.g. by setting
        `self.num_best = 10` before a query.

        """
        # Undocumented convenience: output_prefix=None puts the index in temp.
        self.output_prefix = (output_prefix if output_prefix is not None
                              else utils.randfname(prefix='simserver'))
        logger.info("starting similarity index under %s", self.output_prefix)
        self.num_features = num_features
        self.num_best = num_best
        self.norm = norm
        self.chunksize = int(chunksize)
        self.shardsize = shardsize
        # No shards yet; buffer of not-yet-sharded docs and their nnz count.
        self.shards = []
        self.fresh_docs = []
        self.fresh_nnz = 0

        if corpus is not None:
            self.add_documents(corpus)
Example no. 2
    def __init__(self,
                 output_prefix,
                 corpus,
                 num_features,
                 num_best=None,
                 chunksize=256,
                 shardsize=32768,
                 norm='l2'):
        """Initialize the sharded similarity index.

        Parameters
        ----------
        output_prefix : str
            Filename prefix for the on-disk shards; shard `i` is written to
            ``output_prefix.i``. If None, a random filename in temp is used.
        corpus : iterable of list of (int, number)
            Initial documents in streamed Gensim bag-of-words format, or None
            to start empty and call `add_documents` later.
        num_features : int
            Size of the dictionary (number of features).
        num_best : int, optional
            When given, queries return only the `num_best` most similar
            documents, always leaving out documents with similarity = 0;
            otherwise a full vector with one float per indexed document
            is returned.
        chunksize : int, optional
            Size of query chunks, used internally when the query is an entire
            corpus.
        shardsize : int, optional
            Maximum shard size, in documents. Pick a value so that a
            `shardsize x chunksize` matrix of floats fits comfortably in RAM.
        norm : {'l1', 'l2'}, optional
            Normalization to use.

        Notes
        -----
        Sharding happens transparently: documents are split into shards of
        `shardsize` documents each, and each shard is converted to a matrix
        for faster BLAS calls, then stored on disk under
        ``output_prefix.shard_number``.

        If your entire index fits in memory (~1 million documents per 1GB of
        RAM), you can also use
        :class:`~gensim.similarities.docsim.MatrixSimilarity` or
        :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` directly.
        These are simpler but do not scale as well: they keep the entire index
        in RAM, with no sharding, and do not support adding new documents
        dynamically.

        """
        if output_prefix is None:
            # undocumented feature: output_prefix=None -> index lives in temp
            output_prefix = utils.randfname(prefix='simserver')
        self.output_prefix = output_prefix
        logger.info("starting similarity index under %s", self.output_prefix)

        self.num_features = num_features
        self.num_best = num_best
        self.norm = norm
        self.chunksize = int(chunksize)
        self.shardsize = shardsize

        # completed shards, plus the buffer of not-yet-sharded documents
        self.shards = []
        self.fresh_docs, self.fresh_nnz = [], 0

        if corpus is not None:
            self.add_documents(corpus)
Example no. 3
    def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768,
                 use_reverse_index=False):
        """Construct the index from `corpus`.

        The index can be later extended by calling `add_documents`.
        **Note**: documents are split (internally, transparently) into shards
        of `shardsize` documents each, converted to a matrix, for faster BLAS
        calls. Each shard is stored to disk under ``output_prefix.shard_number``
        (= you need write access to that location). If you don't specify an
        output prefix, a random filename in temp will be used.

        `shardsize` should be chosen so that a `shardsize x chunksize` matrix
        of floats fits comfortably into main memory.

        `num_features` is the number of features in the `corpus` (e.g. size of
        the dictionary, or the number of latent topics for latent semantic
        models).

        `use_reverse_index` toggles the (currently inactive) reverse-index
        support; the flag is only stored on the instance for now.

        If `num_best` is left unspecified, similarity queries will return a
        full vector with one float for every document in the index:

        >>> index = Similarity('/path/to/index', corpus, num_features=400) # if corpus has 7 documents...
        >>> index[query] # ... then result will have 7 floats
        [0.0, 0.0, 0.2, 0.13, 0.8, 0.0, 0.1]

        If `num_best` is set, queries return only the `num_best` most similar
        documents, always leaving out documents for which the similarity is 0.
        If the input vector itself only has features with zero values (= the
        sparse representation is empty), the returned list will always be
        empty.

        >>> index.num_best = 3
        >>> index[query] # return at most "num_best" of `(index_of_document, similarity)` tuples
        [(4, 0.8), (2, 0.13), (3, 0.13)]

        You can also override `num_best` dynamically, simply by setting e.g.
        `self.num_best = 10` before doing a query.

        """
        if output_prefix is None:
            # undocumented feature: set output_prefix=None to create the server in temp
            self.output_prefix = utils.randfname(prefix='simserver')
        else:
            self.output_prefix = output_prefix
        # Lazy %-style args (not eager "%" formatting): the message is only
        # interpolated if the INFO level is actually enabled.
        logger.info("starting similarity index under %s", self.output_prefix)
        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = int(chunksize)
        self.shardsize = shardsize
        self.shards = []
        self.fresh_docs, self.fresh_nnz = [], 0
        self.use_reverse_index = use_reverse_index

        # NOTE(review): reverse-index construction is disabled for now. It was
        # previously kept as a no-op string-literal statement; preserved here
        # as a comment so it cannot be mistaken for active code:
        # if self.use_reverse_index:
        #     self.reverse_index = ReverseIndex(num_documents=len(corpus), num_features=self.num_features)

        if corpus is not None:
            self.add_documents(corpus)
Example no. 4
    def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'):
        """Set up a disk-backed, sharded similarity index.

        Parameters
        ----------
        output_prefix : str
            Prefix for shard filenames (each shard is stored under
            ``output_prefix.shard_number``). If None, a random filename in
            temp will be used.
        corpus : iterable of list of (int, number)
            Corpus in streamed Gensim bag-of-words format; pass None to start
            with an empty index.
        num_features : int
            Size of the dictionary (number of features).
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always
            leaving out documents with similarity = 0. Otherwise, return a
            full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks; used internally when the query is an entire
            corpus.
        shardsize : int, optional
            Maximum shard size, in documents. Choose a value so that a
            `shardsize x chunksize` matrix of floats fits comfortably into
            your RAM.
        norm : {'l1', 'l2'}, optional
            Normalization to use.

        Notes
        -----
        Documents are split (internally, transparently) into shards of
        `shardsize` documents each, with each shard converted to a matrix for
        faster BLAS calls.

        For an index that fits entirely in memory (~1 million documents per
        1GB of RAM), consider
        :class:`~gensim.similarities.docsim.MatrixSimilarity` or
        :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` instead;
        they are simpler, but keep the whole index in RAM (no sharding) and
        do not support adding new documents dynamically.

        """
        # output_prefix=None is an undocumented shortcut: store shards in temp.
        if output_prefix is not None:
            self.output_prefix = output_prefix
        else:
            self.output_prefix = utils.randfname(prefix='simserver')
        logger.info("starting similarity index under %s", self.output_prefix)

        self.num_features = num_features
        self.num_best = num_best
        self.norm = norm
        self.chunksize = int(chunksize)
        self.shardsize = shardsize

        self.shards = []            # finished, on-disk shards
        self.fresh_docs = []        # documents not yet flushed to a shard
        self.fresh_nnz = 0          # non-zero count of the fresh documents

        if corpus is not None:
            self.add_documents(corpus)
Example no. 5
    def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=1024, shardsize=32768):
        """Build the shard-backed similarity index from `corpus`.

        The index can be extended later via `add_documents`. Documents are
        split into shards of `shardsize` documents each, converted to a
        matrix (for fast BLAS calls) and stored to disk under
        ``output_prefix.shard_number`` (= you need write access to that
        location). If no output prefix is specified, a random filename in
        temp is used.

        `shardsize` should be chosen so that a `shardsize x chunksize` matrix
        of floats fits comfortably into main memory.

        `num_features` is the number of features in the `corpus` (e.g. size
        of the dictionary, or the number of latent topics for latent semantic
        models).

        If `num_best` is left unspecified, similarity queries return a full
        vector with one float for every document in the index:

        >>> index = Similarity('/path/to/index', corpus, num_features=400) # if corpus has 7 documents...
        >>> index[query] # ... then result will have 7 floats
        [0.0, 0.0, 0.2, 0.13, 0.8, 0.0, 0.1]

        If `num_best` is set, queries return only the `num_best` most similar
        documents:

        >>> index.num_best = 3
        >>> index[query] # return at most "num_best" of `(index_of_document, similarity)` tuples
        [(4, 0.8), (2, 0.13), (3, 0.13)]

        """
        # output_prefix=None (undocumented) puts the index under a random temp name.
        self.output_prefix = (utils.randfname(prefix='simserver')
                              if output_prefix is None else output_prefix)
        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = int(chunksize)
        self.shardsize = shardsize
        # No shards yet; buffer of not-yet-sharded docs and their nnz count.
        self.shards = []
        self.fresh_docs = []
        self.fresh_nnz = 0

        if corpus is not None:
            self.add_documents(corpus)