Example 1
    def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
                 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                       matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(
                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                dtype=dtype, printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Example 2
    def test_scipy2scipy_clipped(self):
        # Test for scipy vector/row
        vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15]
        expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)]
        vec_scipy = scipy.sparse.csr_matrix(vec)
        vec_scipy_clipped = matutils.scipy2scipy_clipped(vec_scipy, topn=3)
        self.assertTrue(scipy.sparse.issparse(vec_scipy_clipped))
        self.assertEqual(matutils.scipy2sparse(vec_scipy_clipped), expected)

        # Test for scipy matrix
        vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15]
        expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)]
        matrix_scipy = scipy.sparse.csr_matrix([vec] * 3)
        matrix_scipy_clipped = matutils.scipy2scipy_clipped(matrix_scipy, topn=3)
        self.assertTrue(scipy.sparse.issparse(matrix_scipy_clipped))
        self.assertEqual([matutils.scipy2sparse(x) for x in matrix_scipy_clipped], [expected] * 3)
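The test above exercises matutils.scipy2scipy_clipped, which keeps only the topn largest-magnitude entries of a scipy sparse vector (or of each row of a sparse matrix) while keeping the result sparse. A small sketch based on the same values, assuming gensim and scipy are installed:

    import scipy.sparse
    from gensim import matutils

    # sparse row with four non-zero entries
    vec = scipy.sparse.csr_matrix([0.8, 0.2, 0.0, 0.0, -0.1, -0.15])

    # keep the 3 entries with the largest absolute values; the output is still sparse
    clipped = matutils.scipy2scipy_clipped(vec, topn=3)
    print(matutils.scipy2sparse(clipped))  # expected: [(0, 0.8), (1, 0.2), (5, -0.15)]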
Example 3
    def testMaintainSparsityWithNumBest(self):
        """Tests that sparsity is correctly maintained when maintain_sparsity=True and num_best is not None"""
        num_features = len(dictionary)

        index = self.cls(corpus, num_features=num_features, maintain_sparsity=False, num_best=3)
        dense_topn_sims = index[corpus]

        index = self.cls(corpus, num_features=num_features, maintain_sparsity=True, num_best=3)
        scipy_topn_sims = index[corpus]

        self.assertFalse(scipy.sparse.issparse(dense_topn_sims))
        self.assertTrue(scipy.sparse.issparse(scipy_topn_sims))
        self.assertEqual(dense_topn_sims, [matutils.scipy2sparse(v) for v in scipy_topn_sims])
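To illustrate what the test checks, here is a minimal sketch, assuming gensim and scipy are installed; the toy corpus replaces the shared corpus and dictionary fixtures used by the test, and self.cls is taken to be SparseMatrixSimilarity:

    import scipy.sparse
    from gensim import similarities

    corpus = [
        [(0, 1.0), (1, 1.0)],
        [(1, 1.0), (2, 1.0)],
        [(0, 1.0), (2, 1.0)],
    ]

    dense_index = similarities.SparseMatrixSimilarity(corpus, num_features=3, num_best=2)
    sparse_index = similarities.SparseMatrixSimilarity(
        corpus, num_features=3, num_best=2, maintain_sparsity=True)

    # with maintain_sparsity=False the top-n results come back as plain Python lists
    print(scipy.sparse.issparse(dense_index[corpus]))   # False
    # with maintain_sparsity=True the clipped result stays a scipy sparse matrix
    print(scipy.sparse.issparse(sparse_index[corpus]))  # True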
Example 4
    def __init__(self,
                 corpus,
                 num_features=None,
                 num_terms=None,
                 num_docs=None,
                 num_nnz=None,
                 num_best=None,
                 chunksize=500,
                 dtype=numpy.float32,
                 maintain_sparsity=False):
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError(
                    "refusing to guess the number of sparse features: specify num_features explicitly"
                )
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray)
                       else matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(corpus,
                                             num_terms=num_terms,
                                             num_docs=num_docs,
                                             num_nnz=num_nnz,
                                             dtype=dtype,
                                             printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Example 5
    def __init__(self,
                 corpus,
                 num_features=None,
                 num_terms=None,
                 num_docs=None,
                 num_nnz=None,
                 num_best=None,
                 chunksize=500,
                 dtype=numpy.float32,
                 maintain_sparsity=False):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            A list of documents in the BoW format.
        num_features : int, optional
            Size of the dictionary. Must be either specified, or present in `corpus.num_terms`.
        num_terms : int, optional
            Alias for `num_features`, you can use either.
        num_docs : int, optional
            Number of documents in `corpus`. Will be calculated if not provided.
        num_nnz : int, optional
            Number of non-zero elements in `corpus`. Will be calculated if not provided.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Data type of the internal matrix.
        maintain_sparsity : bool, optional
            Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?

        """
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError(
                    "refusing to guess the number of sparse features: specify num_features explicitly"
                )
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray)
                       else matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(corpus,
                                             num_terms=num_terms,
                                             num_docs=num_docs,
                                             num_nnz=num_nnz,
                                             dtype=dtype,
                                             printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Example 6
    def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
                 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            A list of documents in the BoW format.
        num_features : int, optional
            Size of the dictionary. Must be either specified, or present in `corpus.num_terms`.
        num_terms : int, optional
            Alias for `num_features`, you can use either.
        num_docs : int, optional
            Number of documents in `corpus`. Will be calculated if not provided.
        num_nnz : int, optional
            Number of non-zero elements in `corpus`. Will be calculated if not provided.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Data type of the internal matrix.
        maintain_sparsity : bool, optional
            Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?

        """
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                       matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(
                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                dtype=dtype, printprogress=10000
            ).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)