def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): self.num_best = num_best self.normalize = True self.chunksize = chunksize self.maintain_sparsity = maintain_sparsity if corpus is not None: logger.info("creating sparse index") # iterate over input corpus, populating the sparse index matrix try: # use the more efficient corpus generation version, if the input # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes). num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz logger.debug("using efficient sparse index creation") except AttributeError: # no MmCorpus, use the slower version (or maybe user supplied the # num_* params in constructor) pass if num_features is not None: # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity num_terms = num_features if num_terms is None: raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly") corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else matutils.unitvec(v)) for v in corpus) self.index = matutils.corpus2csc( corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz, dtype=dtype, printprogress=10000).T # convert to Compressed Sparse Row for efficient row slicing and multiplications self.index = self.index.tocsr() # currently no-op, CSC.T is already CSR logger.info("created %r", self.index)
def test_scipy2scipy_clipped(self): # Test for scipy vector/row vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15] expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)] vec_scipy = scipy.sparse.csr_matrix(vec) vec_scipy_clipped = matutils.scipy2scipy_clipped(vec_scipy, topn=3) self.assertTrue(scipy.sparse.issparse(vec_scipy_clipped)) self.assertTrue(matutils.scipy2sparse(vec_scipy_clipped), expected) # Test for scipy matrix vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15] expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)] matrix_scipy = scipy.sparse.csr_matrix([vec] * 3) matrix_scipy_clipped = matutils.scipy2scipy_clipped(matrix_scipy, topn=3) self.assertTrue(scipy.sparse.issparse(matrix_scipy_clipped)) self.assertTrue([matutils.scipy2sparse(x) for x in matrix_scipy_clipped], [expected] * 3)
def test_scipy2scipy_clipped(self): # Test for scipy vector/row vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15] expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)] vec_scipy = scipy.sparse.csr_matrix(vec) vec_scipy_clipped = matutils.scipy2scipy_clipped(vec_scipy, topn=3) self.assertTrue(scipy.sparse.issparse(vec_scipy_clipped)) self.assertTrue(matutils.scipy2sparse(vec_scipy_clipped), expected) # Test for scipy matrix vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15] expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)] matrix_scipy = scipy.sparse.csr_matrix([vec] * 3) matrix_scipy_clipped = matutils.scipy2scipy_clipped(matrix_scipy, topn=3) self.assertTrue(scipy.sparse.issparse(matrix_scipy_clipped)) self.assertTrue([matutils.scipy2sparse(x) for x in matrix_scipy_clipped], [expected] * 3)
def testMaintainSparsityWithNumBest(self): """Tests that sparsity is correctly maintained when maintain_sparsity=True and num_best is not None""" num_features = len(dictionary) index = self.cls(corpus, num_features=num_features, maintain_sparsity=False, num_best=3) dense_topn_sims = index[corpus] index = self.cls(corpus, num_features=num_features, maintain_sparsity=True, num_best=3) scipy_topn_sims = index[corpus] self.assertFalse(scipy.sparse.issparse(dense_topn_sims)) self.assertTrue(scipy.sparse.issparse(scipy_topn_sims)) self.assertEqual(dense_topn_sims, [matutils.scipy2sparse(v) for v in scipy_topn_sims])
def testMaintainSparsityWithNumBest(self): """Tests that sparsity is correctly maintained when maintain_sparsity=True and num_best is not None""" num_features = len(dictionary) index = self.cls(corpus, num_features=num_features, maintain_sparsity=False, num_best=3) dense_topn_sims = index[corpus] index = self.cls(corpus, num_features=num_features, maintain_sparsity=True, num_best=3) scipy_topn_sims = index[corpus] self.assertFalse(scipy.sparse.issparse(dense_topn_sims)) self.assertTrue(scipy.sparse.issparse(scipy_topn_sims)) self.assertEqual(dense_topn_sims, [matutils.scipy2sparse(v) for v in scipy_topn_sims])
def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): self.num_best = num_best self.normalize = True self.chunksize = chunksize self.maintain_sparsity = maintain_sparsity if corpus is not None: logger.info("creating sparse index") # iterate over input corpus, populating the sparse index matrix try: # use the more efficient corpus generation version, if the input # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes). num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz logger.debug("using efficient sparse index creation") except AttributeError: # no MmCorpus, use the slower version (or maybe user supplied the # num_* params in constructor) pass if num_features is not None: # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity num_terms = num_features if num_terms is None: raise ValueError( "refusing to guess the number of sparse features: specify num_features explicitly" ) corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else matutils.unitvec(v)) for v in corpus) self.index = matutils.corpus2csc(corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz, dtype=dtype, printprogress=10000).T # convert to Compressed Sparse Row for efficient row slicing and multiplications self.index = self.index.tocsr( ) # currently no-op, CSC.T is already CSR logger.info("created %r", self.index)
def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): """ Parameters ---------- corpus: iterable of list of (int, float) A list of documents in the BoW format. num_features : int, optional Size of the dictionary. Must be either specified, or present in `corpus.num_terms`. num_terms : int, optional Alias for `num_features`, you can use either. num_docs : int, optional Number of documents in `corpus`. Will be calculated if not provided. num_nnz : int, optional Number of non-zero elements in `corpus`. Will be calculated if not provided. num_best : int, optional If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0. Otherwise, return a full vector with one float for every document in the index. chunksize : int, optional Size of query chunks. Used internally when the query is an entire corpus. dtype : numpy.dtype, optional Data type of the internal matrix. maintain_sparsity : bool, optional Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`? """ self.num_best = num_best self.normalize = True self.chunksize = chunksize self.maintain_sparsity = maintain_sparsity if corpus is not None: logger.info("creating sparse index") # iterate over input corpus, populating the sparse index matrix try: # use the more efficient corpus generation version, if the input # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes). num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz logger.debug("using efficient sparse index creation") except AttributeError: # no MmCorpus, use the slower version (or maybe user supplied the # num_* params in constructor) pass if num_features is not None: # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity num_terms = num_features if num_terms is None: raise ValueError( "refusing to guess the number of sparse features: specify num_features explicitly" ) corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else matutils.unitvec(v)) for v in corpus) self.index = matutils.corpus2csc(corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz, dtype=dtype, printprogress=10000).T # convert to Compressed Sparse Row for efficient row slicing and multiplications self.index = self.index.tocsr( ) # currently no-op, CSC.T is already CSR logger.info("created %r", self.index)
def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): """ Parameters ---------- corpus: iterable of list of (int, float) A list of documents in the BoW format. num_features : int, optional Size of the dictionary. Must be either specified, or present in `corpus.num_terms`. num_terms : int, optional Alias for `num_features`, you can use either. num_docs : int, optional Number of documents in `corpus`. Will be calculated if not provided. num_nnz : int, optional Number of non-zero elements in `corpus`. Will be calculated if not provided. num_best : int, optional If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0. Otherwise, return a full vector with one float for every document in the index. chunksize : int, optional Size of query chunks. Used internally when the query is an entire corpus. dtype : numpy.dtype, optional Data type of the internal matrix. maintain_sparsity : bool, optional Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`? """ self.num_best = num_best self.normalize = True self.chunksize = chunksize self.maintain_sparsity = maintain_sparsity if corpus is not None: logger.info("creating sparse index") # iterate over input corpus, populating the sparse index matrix try: # use the more efficient corpus generation version, if the input # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes). num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz logger.debug("using efficient sparse index creation") except AttributeError: # no MmCorpus, use the slower version (or maybe user supplied the # num_* params in constructor) pass if num_features is not None: # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity num_terms = num_features if num_terms is None: raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly") corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else matutils.unitvec(v)) for v in corpus) self.index = matutils.corpus2csc( corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz, dtype=dtype, printprogress=10000 ).T # convert to Compressed Sparse Row for efficient row slicing and multiplications self.index = self.index.tocsr() # currently no-op, CSC.T is already CSR logger.info("created %r", self.index)