Example #1
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms)

        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T # (x^T * u).T = u^-1 * x
        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist # s^-1 * u^-1 * x

        # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist.flat)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
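For context, this `__getitem__` is what backs the `lsi[...]` indexing syntax. Below is a minimal usage sketch, assuming a toy corpus built with gensim's public `Dictionary` and `LsiModel` APIs; the variable names are illustrative and not taken from the example above.

from gensim import corpora, models

# tiny illustrative corpus (an assumption, not part of the example above)
texts = [["human", "interface", "computer"], ["graph", "trees", "minors"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

# a single document: returns a list of (topic_id, topic_value) 2-tuples
print(lsi[corpus[0]])

# a whole corpus: documents are transformed in chunks and a corpus-like object is returned
for doc in lsi[corpus]:
    print(doc)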
Example #2
    def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
                 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                       matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(
                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                dtype=dtype, printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Example #3
    def __init__(self, uci_dir, dictionary, n_topics):
        bv = artm.BatchVectorizer(data_format='bow_uci', data_path=uci_dir, collection_name='corpus',
                                  target_folder=uci_dir + '/artm_batches')
        bv_dict = bv.dictionary

        logging.info("Fitting the ARTM model")
        model = artm.ARTM(dictionary=bv_dict, num_topics=n_topics)

        model.fit_offline(batch_vectorizer=bv, num_collection_passes=10)

        logging.info("Processing word-topic matrices")

        # Create a new word-topic matrix according to dictionary indices
        self.phi = np.zeros(model.phi_.shape, dtype=np.float64)
        for word, vec in model.phi_.iterrows():
            idx = dictionary.token2id[word[1]]
            self.phi[idx, :] = vec

        logging.info("Building the index for ARTM")
        corpus = model.transform(bv).T.sort_index()
        corpus = [matutils.full2sparse(row) for index, row in corpus.iterrows()]
        self.index = similarities.MatrixSimilarity(corpus, num_features=n_topics, num_best=self.N_BEST)

        self.model = model
        self.dictionary = dictionary
Example #4
 def train(self, read_article_ids=None, unread_article_ids=None):
     # load user feedback if needed
     if read_article_ids is None:
         read_article_ids = (r.article.id for r in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))

     user_feedback = Article.objects(id__in=read_article_ids)

     # TODO: cluster feedback articles and save more than one profile

     num_loaded_articles = 0
     centroid = numpy.zeros(self.num_features_, dtype=numpy.float32)

     for article in user_feedback:
         try:
             article_features_as_full_vec = self.get_features(article)
         except Exception as inst:
             logger.error("Could not get features for article %s: %s" %
                          (article.id, inst))
             continue

         # normalize the article vector to unit length
         tmp_doc = matutils.unitvec(article_features_as_full_vec)

         # add up tmp_doc
         centroid = numpy.add(centroid, tmp_doc)
         num_loaded_articles += 1

     # average each element
     if num_loaded_articles != 0:
         centroid = centroid / num_loaded_articles

     centroid = matutils.full2sparse(centroid)

     # set user model data
     self.user_model_features = [centroid]
Example #5
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
        This is done by folding input document into the latent topic space.
        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
        vec = matutils.corpus2csc(bow,
                                  num_terms=self.num_terms,
                                  dtype=self.projection.u.dtype)
        topic_dist = (
            vec.T *
            self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        # # convert input to dense, then do dense * dense multiplication
        # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse), but consumes more memory
        # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
        # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

        # # use np's advanced indexing to simulate sparse * dense
        # # ± same speed again
        # u = self.projection.u[:, :self.num_topics]
        # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
        # for vecno, vec in enumerate(bow):
        #     indices, data = zip(*vec) if vec else ([], [])
        #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]
                          ) * topic_dist  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
Example #6
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.

        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).

        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        # # convert input to dense, then do dense * dense multiplication
        # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse),
        # # but consumes more memory
        # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
        # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

        # # use np's advanced indexing to simulate sparse * dense
        # # ± same speed again
        # u = self.projection.u[:, :self.num_topics]
        # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
        # for vecno, vec in enumerate(bow):
        #     indices, data = zip(*vec) if vec else ([], [])
        #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
Example #7
    def __iter__(self):
        """The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each
        document.
        """
        for i, image in enumerate(self.get_images()):
            logging.debug('__iter__ Yielding image no. {0}'.format(i))
            yield matutils.full2sparse(image, self.eps)
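For reference, `full2sparse(vec, eps)` drops entries whose absolute value is at or below `eps` and returns the rest as (feature_id, value) tuples. A tiny illustrative sketch (the array is an assumption, not an actual image vector):

import numpy as np
from gensim import matutils

image = np.array([0.0, 0.5, 1e-12, 0.25])
print(matutils.full2sparse(image, eps=1e-9))  # keeps only entries with |value| > eps, e.g. [(1, 0.5), (3, 0.25)]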
Example #8
def sparse_mean(sparse_vectors_list):
    dense_vectors_list = []

    for vec in sparse_vectors_list:
        dense_vectors_list.append(matutils.sparse2full(vec, length=1013243))

    mean = np.mean(dense_vectors_list, axis=0)

    return matutils.unitvec(matutils.full2sparse(mean))
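The same dense round-trip on a tiny vector length instead of the hard-coded 1013243-dimensional space; the input vectors below are illustrative assumptions:

import numpy as np
from gensim import matutils

vecs = [[(0, 1.0), (2, 3.0)], [(0, 3.0), (1, 2.0)]]
dense = [matutils.sparse2full(v, length=4) for v in vecs]  # expand each sparse vector to dense
mean = np.mean(dense, axis=0)                              # element-wise mean
print(matutils.unitvec(matutils.full2sparse(mean)))        # back to a unit-length sparse vector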
Example #9
    def value_for_text(self, t, rp=default_rp):
        space = rp.lsa_space()
        num_topics = space.num_topics

        tokens = rp.tokens(t)
        tokens = [[token.lower() for token in sentence] for sentence in tokens]

        if len(tokens) < 2:
            return 0

        spans = np.zeros(len(tokens) - 1)
        for i in range(1, len(tokens)):
            past_sentences = tokens[:i]
            span_dim = len(past_sentences)

            if span_dim > num_topics - 1:
                # It's not clear, from the papers I read, what should be done
                # in this case. I did what seemed not to imply losing
                # information.
                beginning = past_sentences[0:span_dim - num_topics]
                past_sentences[0] = list(chain.from_iterable(beginning))

            past_vectors = [
                sparse2full(space.get_vector(sent), num_topics)
                for sent in past_sentences
            ]

            curr_vector = sparse2full(space.get_vector(tokens[i]), num_topics)
            curr_array = np.array(curr_vector).reshape(num_topics, 1)

            A = np.array(past_vectors).transpose()

            projection_matrix = dot(dot(A, pinv(dot(A.transpose(), A))),
                                    A.transpose())

            projection = dot(projection_matrix, curr_array).ravel()

            spans[i - 1] = cossim(full2sparse(curr_vector),
                                  full2sparse(projection))

        return self.get_value(spans)
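The core step in the loop above projects the current sentence vector onto the span of the preceding sentence vectors, using the projection matrix P = A (AᵀA)⁺ Aᵀ, and then scores the cosine similarity between the vector and its projection. A self-contained sketch of just that step, with toy vectors assumed for illustration:

import numpy as np
from numpy import dot
from numpy.linalg import pinv
from gensim.matutils import full2sparse, cossim

A = np.array([[1.0, 0.0],
              [0.0, 1.0],
              [0.0, 0.0]])          # two past-sentence vectors as columns
curr = np.array([1.0, 1.0, 1.0])    # current sentence vector

P = dot(dot(A, pinv(dot(A.T, A))), A.T)   # projection onto the column span of A
projection = dot(P, curr)
print(cossim(full2sparse(curr), full2sparse(projection)))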
Example #10
    def value_for_text(self, t, rp=default_rp):
        space = rp.lsa_space()
        num_topics = space.num_topics

        tokens = rp.tokens(t)
        tokens = [[token.lower() for token in sentence] for sentence in tokens]

        if len(tokens) < 2:
            return 0

        spans = np.zeros(len(tokens) - 1)
        for i in range(1, len(tokens)):
            past_sentences = tokens[:i]
            span_dim = len(past_sentences)

            if span_dim > num_topics - 1:
                # It's not clear, from the papers I read, what should be done
                # in this case. I did what seemed not to imply losing
                # information.
                beginning = past_sentences[0:span_dim - num_topics]
                past_sentences[0] = list(chain.from_iterable(beginning))

            past_vectors = [sparse2full(space.get_vector(sent), num_topics)
                            for sent in past_sentences]

            curr_vector = sparse2full(space.get_vector(tokens[i]), num_topics)
            curr_array = np.array(curr_vector).reshape(num_topics, 1)

            A = np.array(past_vectors).transpose()

            projection_matrix = dot(dot(A,
                                        pinv(dot(A.transpose(),
                                                 A))),
                                    A.transpose())

            projection = dot(projection_matrix, curr_array).ravel()

            spans[i - 1] = cossim(full2sparse(curr_vector),
                                  full2sparse(projection))

        return self.get_value(spans)
Example #11
    def __init__(self,
                 corpus,
                 num_features=None,
                 num_terms=None,
                 num_docs=None,
                 num_nnz=None,
                 num_best=None,
                 chunksize=500,
                 dtype=numpy.float32,
                 maintain_sparsity=False):
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError(
                    "refusing to guess the number of sparse features: specify num_features explicitly"
                )
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray)
                       else matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(corpus,
                                             num_terms=num_terms,
                                             num_docs=num_docs,
                                             num_nnz=num_nnz,
                                             dtype=dtype,
                                             printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr(
            )  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Example #12
def sparse2matrix(inpath1, inpath2, topics_num, file_name):
    destpath = '/data/mallet_tests/hellinger/tmp_matrice_' + topics_num + '_' + file_name

    with open(inpath1, 'r') as comparator:
        with io.open(inpath2, 'r') as comparable:
            i = 0

            for line_tor in comparator:
                print(line_tor.split()[:2])
                # parse the "topic:count" pairs and expand them into a dense topic vector
                l_tor = line_tor.split()[2:]
                l_tor = tuple(tuple(map(int, p.split(':'))) for p in l_tor)
                len_tor = int(topics_num.split('x')[0])
                mat_tor = mat.sparse2full(doc=l_tor, length=len_tor)

                # read the matching line from the second file and expand it the same way
                line_ble = comparable.readline()
                print(line_ble.split()[:2])
                l_ble = line_ble.split()[2:]
                l_ble = tuple(tuple(map(int, p.split(':'))) for p in l_ble)
                len_ble = int(topics_num.split('x')[1])
                mat_ble = mat.sparse2full(doc=l_ble, length=len_ble)

                # pairwise squared Hellinger terms between the two topic vectors
                matrix = n.zeros(shape=(len_ble, len_tor))
                for k in range(len_tor):
                    for j in range(len_ble):
                        matrix[j][k] = pow(abs(math.sqrt(mat_tor[k]) - math.sqrt(mat_ble[j])), 2)
                        print(matrix[j][k])

                # write the matrix out in sparse (feature_id, value) form
                with open(destpath + '_' + line_tor.split()[1] + '.txt', 'w') as matrixfile:
                    matrixfile.write(str(mat.full2sparse(matrix)))
                print('word %s done' % line_ble.split()[:2])
                i += 1

    print('matrixes done')
Example #13
 def __get_centroid(self, cluster):
     # averages all docs in cluster
     count = 0
     centroid = numpy.zeros(self.num_features, dtype=numpy.float32)
     for doc_id in cluster:
         doc = self.similarity_index.vector_by_id(doc_id).toarray().flatten()
         # full_doc = matutils.sparse2full(doc, self.num_features)

         centroid = centroid + doc
         count += 1

     if count != 0:
         centroid = centroid / count

     return matutils.full2sparse(centroid)
Example #14
    def __getitem__(self, bow, scaled=False, chunksize=256):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform 256 documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # 256 smaller mat * vec multiplications, better use of cache).
            return self._apply(bow, chunksize=chunksize)

        if is_corpus:
            # wrap in a list: newer numpy versions may reject a bare generator in vstack
            vec = numpy.vstack([matutils.sparse2full(doc, self.num_terms).astype(self.projection.u.dtype) for doc in bow]).T
        else:
            vec = matutils.sparse2full(bow, self.num_terms).astype(self.projection.u.dtype)

        assert self.projection.u is not None, "decomposition not initialized yet"
        # automatically convert U to memory order suitable for column slicing
        # this will ideally be done only once, at the very first lsi[query] transformation
        self.projection.u = asfarray(self.projection.u)
        topic_dist = numpy.dot(self.projection.u[:, :self.num_topics].T, vec) # u^-1 * x
        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist # s^-1 * u^-1 * x

        # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
Example #15
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms)

        topic_dist = (
            vec.T *
            self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x
        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]
                          ) * topic_dist  # s^-1 * u^-1 * x

        # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist.flat)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
Example #16
    def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
                 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
        """

        Parameters
        ----------
        corpus: iterable of list of (int, float)
            A list of documents in the BoW format.
        num_features : int, optional
            Size of the dictionary. Must be either specified, or present in `corpus.num_terms`.
        num_terms : int, optional
            Alias for `num_features`, you can use either.
        num_docs : int, optional
            Number of documents in `corpus`. Will be calculated if not provided.
        num_nnz : int, optional
            Number of non-zero elements in `corpus`. Will be calculated if not provided.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Data type of the internal matrix.
        maintain_sparsity : bool, optional
            Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?

        """
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                       matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(
                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                dtype=dtype, printprogress=10000
            ).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Example #17
 def get_similar(self, doc, topn=10):
     m = np.asarray([matutils.sparse2full(doc, len(self.dictionary))])
     bv = artm.BatchVectorizer(data_format='bow_n_wd', n_wd=m.T, vocabulary=self.dictionary)
     sims = self.index[matutils.full2sparse(self.model.transform(bv))]
     return [t[0] for t in sims[:topn]]
Example #18
    def __init__(self,
                 corpus,
                 num_features=None,
                 num_terms=None,
                 num_docs=None,
                 num_nnz=None,
                 num_best=None,
                 chunksize=500,
                 dtype=numpy.float32,
                 maintain_sparsity=False):
        """

        Parameters
        ----------
        corpus: iterable of list of (int, float)
            A list of documents in the BoW format.
        num_features : int, optional
            Size of the dictionary. Must be either specified, or present in `corpus.num_terms`.
        num_terms : int, optional
            Alias for `num_features`, you can use either.
        num_docs : int, optional
            Number of documents in `corpus`. Will be calculated if not provided.
        num_nnz : int, optional
            Number of non-zero elements in `corpus`. Will be calculated if not provided.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Data type of the internal matrix.
        maintain_sparsity : bool, optional
            Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?

        """
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError(
                    "refusing to guess the number of sparse features: specify num_features explicitly"
                )
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray)
                       else matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(corpus,
                                             num_terms=num_terms,
                                             num_docs=num_docs,
                                             num_nnz=num_nnz,
                                             dtype=dtype,
                                             printprogress=10000).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr(
            )  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)
Example #19
def ndarray2gensim(array):
    """Convert a numpy ndarray into a gensim-style generator of lists of
    tuples."""
    return (full2sparse(row) for row in array)
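A quick usage sketch of the helper above; the 2x3 array is an assumption for illustration:

import numpy as np
from gensim.matutils import full2sparse

array = np.array([[0.0, 1.5, 0.0],
                  [2.0, 0.0, 3.0]])
for doc in ndarray2gensim(array):
    print(doc)  # one sparse (feature_id, value) list per row, e.g. [(1, 1.5)]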
Example #20
 def __getitem__(self, item):
     return full2sparse(np.random.randn(1, self.dims))