Example #1
    def test_simple_lists_of_tuples(self):
        # test list words

        # one document, one word
        potentialCorpus = [[(0, 4.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        # one document, several words
        potentialCorpus = [[(0, 4.), (1, 2.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        potentialCorpus = [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        # several documents, one word
        potentialCorpus = [[(0, 4.)], [(1, 2.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        potentialCorpus = [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)
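
A minimal sketch, not part of the test suite, illustrating the (flag, corpus) contract these assertions exercise; it assumes gensim is importable exactly as in the test above.

from gensim import utils

corpus = [[(0, 4.0), (1, 2.0)], [(2, 5.0)]]   # iterable of documents -> treated as a corpus
document = [(0, 4.0), (1, 2.0)]               # a single document in bag-of-words format

print(utils.is_corpus(corpus))     # expected: (True, corpus)
print(utils.is_corpus(document))   # expected: (False, document)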
Example #2
    def __getitem__(self, bow, eps=1e-12):
        """
        Return esa representation of the input vector and/or corpus.

        bow should already be weights, e.g. with TF-IDF
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        # as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # use similarity index to calculate similarity with each vector of corpus
        vector = self.similarity_index[bow]

        # cosine similarity is in [-1, 1]; shift and scale to make it [0, 1]
        vector += 1
        vector /= 2

        #normalize
        vector = matutils.unitvec(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(concept_id, weight)
                  for concept_id, weight
                  in enumerate(vector)
                  if abs(weight) > eps]
        return vector
Example #3
    def __getitem__(self, bow, eps=1e-12):
        """
        Return esa representation of the input vector and/or corpus.
        
        bow should already be weights, e.g. with TF-IDF
        """
        # if the input vector is in fact a corpus, return a transformed corpus 
        # as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # use the corpus as interpreter matrix:
        # simply multiply the feature vector of the input with the corpus matrix
        # to get the weight of each concept
        vector = numpy.dot(matutils.sparse2full(bow, self.num_features), self.corpus)

        #normalize
        vector = matutils.unitvec(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(concept_id, weight)
                  for concept_id, weight
                  in enumerate(vector)
                  if abs(weight) > eps]
        return vector
Example #4
    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
        """
        Return topic distribution for the given document `bow`, as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignore topics with very low probability (below `minimum_probability`).

        If per_word_topics is True, it also returns a list of topics, sorted in descending order of most likely topics for that word.
        It also returns a list of word_ids and each words corresponding topics' phi_values, multiplied by feature length (i.e, word count)

        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output

        if minimum_phi_value is None:
            minimum_phi_value = self.minimum_probability
        minimum_phi_value = max(minimum_phi_value, 1e-8)  # never allow zero values in sparse output

        # if the input vector is a corpus, return a transformed corpus
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            kwargs = dict(
                per_word_topics=per_word_topics,
                minimum_probability=minimum_probability,
                minimum_phi_value=minimum_phi_value
            )
            return self._apply(corpus, **kwargs)

        gamma, phis = self.inference([bow], collect_sstats=per_word_topics)
        topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution

        document_topics = [
            (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
            if topicvalue >= minimum_probability
        ]

        if not per_word_topics:
            return document_topics
        else:
            word_topic = []  # contains word and corresponding topic
            word_phi = []  # contains word and phi values
            for word_type, weight in bow:
                phi_values = []  # contains (phi_value, topic) pairing to later be sorted
                phi_topic = []  # contains topic and corresponding phi value to be returned 'raw' to user
                for topic_id in range(0, self.num_topics):
                    if phis[topic_id][word_type] >= minimum_phi_value:
                        # appends phi values for each topic for that word
                        # these phi values are scaled by feature length
                        phi_values.append((phis[topic_id][word_type], topic_id))
                        phi_topic.append((topic_id, phis[topic_id][word_type]))

                # list of (word_id, [(topic_0, phi_value), (topic_1, phi_value), ...]) pairs
                word_phi.append((word_type, phi_topic))
                # sorts the topics based on most likely topic
                # produces a list like (word_id, [topic_id_most_probable, topic_id_second_most_probable, ...])
                sorted_phi_values = sorted(phi_values, reverse=True)
                topics_sorted = [x[1] for x in sorted_phi_values]
                word_topic.append((word_type, topics_sorted))
            return (document_topics, word_topic, word_phi)  # returns a 3-tuple
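
A rough end-to-end sketch of calling the method above; the toy texts, dictionary and model are illustrative assumptions, not taken from the original code.

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "interface", "computer"], ["graph", "trees", "computer"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, random_state=0)

# single document with per_word_topics=True -> the 3-tuple returned above
doc_topics, word_topics, word_phis = lda.get_document_topics(corpus[0], per_word_topics=True)

# whole corpus -> handled by the is_corpus branch, yielding one result per document
for result in lda.get_document_topics(corpus, per_word_topics=True):
    print(result)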
Example #5
    def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
        is_corpus, current_representation = utils.is_corpus(current_representation)
        if is_corpus:
            for chunk_no, chunk in enumerate(utils.grouper(current_representation, chunksize)):
                ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
                assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
                chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)

                ln.debug("Chunk converted to csc, running through layer..")
                chunk_trans = layer.__getitem__(chunk_as_csc)

                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
                np.save(fname, chunk_trans)
                ln.debug("Finished serializing chunk. Processed %s documents so far." %
                         (chunk_no * chunksize + len(chunk)))
        else:
            ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
            ln.debug("Type of current_representation is %s" % type(current_representation))
            for chunk_no, chunk in enumerate(current_representation):
                ln.debug("converting chunk (%s documents)..." % chunksize)
                chunk_trans = layer.__getitem__(chunk)
                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
                np.save(fname, chunk_trans)
                ln.debug("finished serializing chunk.")

        ln.info("Finished serializing all chunks.")
Example #6
    def __getitem__(self, bow, chunksize=10000):
        #ln.debug("getitem: %s" % chunksize)
        is_corpus, bow = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        ln.info("Computing hidden representation for %s documents..." % len(bow))

        if not chunksize:  # todo I think could be removed altogether
            chunksize = 1

        def transformed_corpus():
            for chunk_no, doc_chunk in enumerate(utils.grouper(bow, chunksize)):
                ln.debug("Converting chunk %s to csc format.." % chunk_no)
                chunk = matutils.corpus2csc(doc_chunk, self.input_dimensionality)
                ln.debug("Computing hidden representation for chunk.. ")
                hidden = self._get_hidden_representations(chunk)
                ln.info("Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                        (chunk_no, chunk_no * chunksize + len(doc_chunk)))
                for column in hidden.T:
                    yield matutils.dense2vec(column.T)
                ln.debug("Done yielding chunk %s" % chunk_no)

            ln.info("Finished computing representations for all chunks.")

        if not is_corpus:
            res = list(transformed_corpus()).pop()
            return res
        else:
            return transformed_corpus()
Example #7
    def __getitem__(self, bow, eps=1e-12):
        """
        Return tf-idf representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)
        vector = [
            (termid, self.wlocal(tf) * self.idfs.get(termid)) for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0
        ]

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.normalize is True:
            vector = matutils.unitvec(vector)
        elif self.normalize:
            vector = self.normalize(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
        return vector
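
A short usage sketch for the tf-idf transformation above, again with an assumed toy corpus.

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [["human", "interface", "computer"], ["graph", "trees", "computer"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus)
print(tfidf[corpus[0]])        # single document -> list of (term_id, weight)
print(list(tfidf[corpus]))     # corpus -> transformed corpus (the is_corpus branch), materialized here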
Example #8
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms)

        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T # (x^T * u).T = u^-1 * x
        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist # s^-1 * u^-1 * x

        # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist.flat)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
Example #9
    def get_similarities(self, query):
        """
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc() # N x T * T x C = N x C
        if result.shape[1] == 1:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
Example #10
    def __getitem__(self, bow, iterations=100):
        """Get vector for document(s).

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document (or corpus) in BoW format.
        iterations : int, optional
            Number of iterations that will be used for inferring.

        Returns
        -------
        list of (int, float)
            LDA vector for document as sequence of (topic_id, topic_probability) **OR**
        list of list of (int, float)
            LDA vectors for corpus in same format.

        """
        is_corpus, corpus = utils.is_corpus(bow)
        if not is_corpus:
            # query is a single document => make a corpus out of it
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = \
            self.mallet_path + ' infer-topics --input %s --inferencer %s ' \
                               '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
        cmd = cmd % (
            self.fcorpusmallet() + '.infer', self.finferencer(),
            self.fdoctopics() + '.infer', iterations, self.topic_threshold
        )
        logger.info("inferring topics with MALLET LDA '%s'", cmd)
        check_output(args=cmd, shell=True)
        result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
        return result if is_corpus else result[0]
Example #11
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus documents. This batch query is more
        efficient than computing the similarities one document after another.
        """
        self.close_shard() # no-op if no documents added to index since last query

        results = []
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.normalize
            results.append(shard[query])

        if self.num_best is None:
            return numpy.hstack(results)

        # only top-n most similars requested; merge the partial results from all shards
        is_corpus, results = utils.is_corpus(results)
        if is_corpus:
            # query = single document?
            result = sorted(sum(results, []), key=lambda item: -item[1])[ : self.num_best]
        else:
            result = []
            for parts in itertools.izip(*results):
                merged = sorted(sum(parts, []), key=lambda item: -item[1])[ : self.num_best]
                result.append(merged)
        return result
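
A sketch of querying such a sharded index; the output prefix and toy corpus below are assumptions for illustration only.

from gensim.corpora import Dictionary
from gensim.similarities import Similarity

texts = [["human", "interface", "computer"], ["graph", "trees", "computer"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

index = Similarity('/tmp/sim_shards', corpus, num_features=len(dictionary))
print(index[corpus[0]])   # one document -> similarities against every indexed document
print(index[corpus])      # corpus query -> one row of similarities per query document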
Example #12
    def top_topics(self, corpus, num_words=20):
        """
        Calculate the Umass topic coherence for each topic. Algorithm from
        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, EMNLP 2011.**
        """
        is_corpus, corpus = utils.is_corpus(corpus)
        if not is_corpus:
            logger.warning("LdaModel.top_topics() called with an empty corpus")
            return

        topics = []
        str_topics = []
        for topic in self.state.get_lambda():
            topic = topic / topic.sum()  # normalize to probability distribution
            bestn = matutils.argsort(topic, topn=num_words, reverse=True)
            topics.append(bestn)
            beststr = [(topic[id], self.id2word[id]) for id in bestn]
            str_topics.append(beststr)

        # top_ids are limited to every topic's top words. should not exceed the
        # vocabulary size.
        top_ids = set(chain.from_iterable(topics))

        # create a document occurrence sparse matrix for each word
        doc_word_list = {}
        for id in top_ids:
            id_list = set()
            for n, document in enumerate(corpus):
                if id in frozenset(x[0] for x in document):
                    id_list.add(n)

            doc_word_list[id] = id_list

        coherence_scores = []
        for t, top_words in enumerate(topics):
            # Calculate each coherence score C(t, top_words)
            coherence = 0.0
            # Sum of top words m=2..M
            for m in top_words[1:]:
                # m_docs is v_m^(t)
                m_docs = doc_word_list[m]

                # Sum of top words l=1..m-1
                # i.e., all words ranked higher than the current word m
                for l in top_words[:m - 1]:
                    # l_docs is v_l^(t)
                    l_docs = doc_word_list[l]

                    # make sure this word appears in some documents.
                    if len(l_docs) > 0:
                        # co_doc_frequency is D(v_m^(t), v_l^(t))
                        co_doc_frequency = len(m_docs.intersection(l_docs))

                        # add to the coherence sum for these two words m, l
                        coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))

            coherence_scores.append((str_topics[t], coherence))

        top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
        return top_topics
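
A brief sketch of calling this coherence ranking on a trained model; the toy corpus is assumed, and since the keyword for the word count may differ across gensim versions, only the positional corpus argument is used here.

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "interface", "computer"], ["graph", "trees", "computer"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)

# each entry pairs a topic's top words with its (u_mass) coherence, best topics first
for topic_words, coherence in lda.top_topics(corpus):
    print(coherence, topic_words)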
Example #13
    def get_similarities(self, query):
        """
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = numpy.asarray(
                [matutils.sparse2full(vec, self.num_features) for vec in query],
                dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.toarray()  # convert sparse to dense
            elif isinstance(query, numpy.ndarray):
                pass
            else:
                # default case: query is a single vector in sparse gensim format
                query = matutils.sparse2full(query, self.num_features)
            query = numpy.asarray(query, dtype=self.index.dtype)

        # do a little transposition dance to stop numpy from making a copy of
        # self.index internally in numpy.dot (very slow).
        result = numpy.dot(self.index, query.T).T  # return #queries x #index
        return result  # XXX: removed casting the result from array to list; does anyone care?
Example #14
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        if not self.corpus:
            return numpy.array([])

        is_corpus, query = utils.is_corpus(query)
        if not is_corpus and isinstance(query, numpy.ndarray):
            query = [self.corpus[i] for i in query]  # convert document indexes to actual documents
        result = self.similarity_matrix.inner_product(query, self.corpus, normalized=True)

        if scipy.sparse.issparse(result):
            return numpy.asarray(result.todense())
        if numpy.isscalar(result):
            return numpy.array(result)
        return numpy.asarray(result)[0]
Example #15
    def __getitem__(self, bow):
        """Get log entropy representation of the input vector and/or corpus.

        Parameters
        ----------
        bow : list of (int, int)
            Document in BoW format.

        Returns
        -------
        list of (int, float)
            Log-entropy vector for passed `bow`.

        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [
            (term_id, math.log(tf + 1) * self.entr.get(term_id))
            for term_id, tf in bow
            if term_id in self.entr
        ]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
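
A minimal sketch of using this log-entropy transformation, assuming a toy corpus.

from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel

texts = [["human", "interface", "computer"], ["graph", "trees", "computer"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

log_ent = LogEntropyModel(corpus)
print(log_ent[corpus[0]])      # single document -> list of (term_id, weight)
print(list(log_ent[corpus]))   # corpus -> transformed corpus via the is_corpus branch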
Example #16
    def __getitem__(self, bow, eps=0.01):
        """Convert document or corpus in BoW format to LDA vectors in BoW format

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document or corpus in BoW format.
        eps : float
            Threshold value (all topics with probability < `eps` will be ignored).

        Returns
        -------
        list of (int, float)
            LDA vector for document **OR**
        list of list of (int, float)
            LDA vectors for corpus.

        """
        is_corpus, dummy_corpus = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        predictions = self._predict(bow)[0]

        topics = []
        for row in predictions:
            row_topics = []
            for topic_id, val in enumerate(row):
                if val > eps:
                    row_topics.append((topic_id, val))
            topics.append(row_topics)

        return topics if is_corpus else topics[0]
Example #17
    def _getbow(self, doc):
        # if doc is an iterable apply to all
        is_corpus, doc = utils.is_corpus(doc)
        if is_corpus:
            return SimpleCorpus(self._apply(doc))

        return self.dict.doc2bow(doc, allow_update=True)
Example #18
    def __getitem__(self, doc):
        # if doc is an iterable apply to all
        is_corpus, doc = utils.is_corpus(doc)
        if is_corpus:
            return self._apply(doc)

        # return transformed doc according to function
        return self.funct(doc, *self.fargs, **self.fkwargs)
Example #19
    def __getitem__(self, bow, eps=0.01):
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)

        gamma = self.inference([bow])[0]
        topic_dist = gamma / sum(gamma) if sum(gamma) != 0 else []
        return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= eps]
Example #20
    def __getitem__(self, bow, eps=0.01):
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)

        gamma, _ = self.inference([bow])
        topic_dist = gamma[0] / sum(gamma[0]) # normalize to proper distribution
        return [topicvalue for topicid, topicvalue in enumerate(topic_dist)]
Example #21
    def __getitem__(self, bow, eps=1e-12):
        """Get the tf-idf representation of an input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the `sparse Gensim bag-of-words format
            <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
            or a streamed corpus of such documents.
        eps : float
            Threshold value; all positions with a tf-idf value less than `eps` will be removed.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document
        :class:`~gensim.interfaces.TransformedCorpus`
            TfIdf corpus, if `bow` is a corpus.

        """
        self.eps = eps
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        tf_array = self.wlocal(np.array(tf_array))

        vector = [
            (termid, tf * self.idfs.get(termid))
            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps
        ]

        if self.normalize is True:
            self.normalize = matutils.unitvec
        elif self.normalize is False:
            self.normalize = utils.identity

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.pivot is None:
            norm_vector = self.normalize(vector)
            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
        else:
            _, old_norm = self.normalize(vector, return_norm=True)
            pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
            norm_vector = [
                (termid, weight / float(pivoted_norm))
                for termid, weight in vector
                if abs(weight / float(pivoted_norm)) > self.eps
            ]
        return norm_vector
Example #22
    def test_getitem_dense2gensim(self):

        corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
                               dim=self.dim, sparse_serialization=False,
                               gensim=True)

        item = corpus[3]
        self.assertTrue(isinstance(item, list))
        self.assertTrue(isinstance(item[0], tuple))

        dslice = corpus[2:6]
        self.assertTrue(next(dslice) == corpus[2])
        dslice = list(dslice)
        self.assertTrue(isinstance(dslice, list))
        self.assertTrue(isinstance(dslice[0], list))
        self.assertTrue(isinstance(dslice[0][0], tuple))

        iscorp, _ = is_corpus(dslice)
        self.assertTrue(iscorp, "Is the object returned by slice notation "
                                "a gensim corpus?")

        ilist = corpus[[2, 3, 4, 5]]
        self.assertTrue(next(ilist) == corpus[2])
        ilist = list(ilist)
        self.assertTrue(isinstance(ilist, list))
        self.assertTrue(isinstance(ilist[0], list))
        self.assertTrue(isinstance(ilist[0][0], tuple))

        # From generators to lists

        self.assertEqual(len(ilist), len(dslice))
        for i in xrange(len(ilist)):
            self.assertEqual(len(ilist[i]), len(dslice[i]),
                             "Row %d: dims %d/%d" % (i, len(ilist[i]),
                                                     len(dslice[i])))
            for j in xrange(len(ilist[i])):
                self.assertEqual(ilist[i][j], dslice[i][j],
                                 "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                                     i, j, str(ilist[i][j]), i, j,
                                     str(dslice[i][j])))


        iscorp, _ = is_corpus(ilist)
        self.assertTrue(iscorp, "Is the object returned by list notation "
                                "a gensim corpus?")
Example #23
    def __getitem__(self, item):

        iscorpus, _ = is_corpus(item)

        if iscorpus or isinstance(item, DatasetABC):
            return self._apply(item)
        else:
            raise ValueError('Cannot apply flatten_composite to individual '
                             'documents.')
Example #24
    def __getitem__(self, doc):
        # if doc is an iterable apply to all
        is_corpus, doc = utils.is_corpus(doc)
        if is_corpus:
            return self._apply(doc)

        self.counter.update(doc)

        return doc
Example #25
    def __getitem__(self, doc):
        # if doc is an iterable apply to all
        is_corpus, doc = utils.is_corpus(doc)
        if is_corpus:
            return self._apply(doc)

        # applying the transformation, return doc as a bag-of-bitokens list
        allow_update = False if len(self.bidict) > 0 else True
        return self.bidict.doc2bob(doc, allow_update)
Example #26
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.

        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).

        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        # # convert input to dense, then do dense * dense multiplication
        # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse),
        # but consumes more memory
        # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
        # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

        # # use np's advanced indexing to simulate sparse * dense
        # # ± same speed again
        # u = self.projection.u[:, :self.num_topics]
        # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
        # for vecno, vec in enumerate(bow):
        #     indices, data = zip(*vec) if vec else ([], [])
        #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
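
And a corresponding usage sketch for the LSI projection above, with an assumed toy corpus.

from gensim.corpora import Dictionary
from gensim.models import LsiModel

texts = [["human", "interface", "computer"], ["graph", "trees", "computer"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
print(lsi[corpus[0]])          # single document -> list of (topic_id, weight)
print(list(lsi[corpus]))       # corpus -> transformed corpus, processed in chunks internally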
Example #27
    def __getitem__(self, bow):
        """
        Return representation with the ids transformed.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)
Example #28
    def __getitem__(self, item):

        iscorpus, _ = is_corpus(item)

        if iscorpus:
            return self._apply(item)
        else:
            #raise ValueError('Cannot apply serializer to individual documents.')
            # Will this work?
            return self.serialized_data[item]
Example #29
  def __getitem__(self, vec, eps=1e-12):
    is_corpus, vec = utils.is_corpus(vec)
    if is_corpus:
      return self._apply(vec)

    if self.L1:
      score = sum(v for _, v in vec) / len(vec) if vec else 0
    else:
      score = sum(v*v for _, v in vec) / len(vec) if vec else 0
    return score
Example #30
    def __getitem__(self, bow, iterations=100):
        is_corpus, corpus = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s"
        cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations)
        logger.info("inferring with Mallet LDA with %s" % cmd)
        call(cmd, shell=True)
        return list(read_doctopics(self.fdoctopics() + '.infer'))