def test_apply(self):
        """Integration test for the frequency-based transformer's ``_apply``.

        Wraps ``self.vtcorp`` in a transformed corpus, serializes it to disk
        in the loader's layout, reloads it, and checks that the reloaded
        object preserves its type, underlying corpus type, dictionary, and
        the original-to-transformed feature mapping.
        """

        # Apply the transformation lazily; result wraps self.vtcorp.
        transformed_vtcorp = self.transformer._apply(self.vtcorp)

        # The wrapped corpus must still expose its gensim dictionary.
        self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

        # Derive the on-disk names the loader layout expects for this
        # transformation label.
        # NOTE(review): indices 0 and 2 appear to be the matrix-data file and
        # the corpus-object file respectively — confirm against the layout's
        # required_text_corpus_names contract.
        transformed_names = self.loader.layout.required_text_corpus_names(self.transformation_label)
        text_data_name = os.path.join(self.data_root,
                                      self.loader.layout.corpus_dir,
                                      transformed_names[0])
        text_obj_name = os.path.join(self.data_root,
                                      self.loader.layout.corpus_dir,
                                      transformed_names[2])

        # Persist both the numerical data (MmCorpus format) and the corpus
        # object itself.
        MmCorpus.serialize(text_data_name, transformed_vtcorp)
        transformed_vtcorp.save(text_obj_name)

        # The loader should now report the text corpora as present.
        self.assertTrue(self.loader.has_text_corpora(self.transformation_label))

        # Register everything we wrote for cleanup in tearDown.
        self.temporary_files.extend([ os.path.join(self.data_root,
                                                   self.loader.layout.corpus_dir,
                                                   transformed_name)
                                      for transformed_name in transformed_names])

        # Round-trip: reload the saved object and verify its structure
        # survived serialization.
        transformed_vtcorp = TransformedCorpus.load(text_obj_name)

        self.assertIsInstance(transformed_vtcorp, TransformedCorpus)
        self.assertIsInstance(transformed_vtcorp.corpus, VTextCorpus)
        self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

        # Python 2 print statement — this module predates Python 3.
        print 'Transformed corpus dictionary size: %i' % len(transformed_vtcorp.corpus.dictionary)
        # The transformer must retain exactly self.k features in its
        # original-to-transformed id mapping.
        self.assertEqual(self.k, len(transformed_vtcorp.obj.orig2transformed))
# ---- Example #2 ----
    def test_indexing(self):
        """Verify integer, slice, and fancy indexing on a corpus, plus the
        indexing behaviour of :class:`TransformedCorpus` wrappers.

        Covers: per-document integer (and numpy integer) access, a range of
        slices, repeated iteration over sliced corpora, length queries,
        input validation, fancy (list/array) indexing, and that a
        TransformedCorpus forwards indexing only when the underlying corpus
        has an index.
        """
        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
        corpus = self.corpus_class(fname)
        reference_docs = list(corpus)

        # Integer indexing, including numpy integer types.
        for position, document in enumerate(reference_docs):
            self.assertEqual(document, corpus[position])
            self.assertEqual(document, corpus[np.int64(position)])

        # Slice indexing must agree with slicing the materialized list.
        for s in (slice(None), slice(0, None), slice(0, -1), slice(2, 4),
                  slice(None, None, 2), slice(None, None, -1)):
            self.assertEqual(reference_docs[s], list(corpus[s]))

        # A sliced corpus must be re-iterable.
        sliced = corpus[:]
        self.assertEqual(reference_docs, list(sliced))
        self.assertEqual(reference_docs, list(sliced))
        self.assertEqual(len(reference_docs), len(corpus))
        self.assertEqual(len(reference_docs), len(corpus[:]))
        self.assertEqual(len(reference_docs[::2]), len(corpus[::2]))

        def _fetch_slice(target, slice_):
            # assertRaises for python 2.6 takes a callable
            return target[slice_]

        # Invalid index types must be rejected.
        self.assertRaises(ValueError, _fetch_slice, corpus, {1})
        self.assertRaises(ValueError, _fetch_slice, corpus, 1.0)

        # Fancy indexing with a list of positions; must also be re-iterable.
        fancy = corpus[[1, 3, 4]]
        picked = [d for i, d in enumerate(reference_docs) if i in (1, 3, 4)]
        self.assertEqual(picked, list(fancy))
        self.assertEqual(picked, list(fancy))
        self.assertEqual(len(corpus[[0, 1, -1]]), 3)
        self.assertEqual(len(corpus[np.asarray([0, 1, -1])]), 3)

        # TransformedCorpus supports indexing only when the underlying
        # corpus has an index; otherwise it raises RuntimeError.
        wrapped = TransformedCorpus(DummyTransformer(), corpus)
        if getattr(corpus, 'index', None) is not None:
            # DummyTransformer increments each value by one.
            self.assertEqual(wrapped[0][0][1], reference_docs[0][0][1] + 1)
            self.assertRaises(ValueError, _fetch_slice, wrapped, {1})
            shifted = [
                value + 1
                for position, document in enumerate(reference_docs)
                if position in (1, 3, 4)
                for _, value in document
            ]
            self.assertEqual(
                shifted,
                list(v for document in wrapped[[1, 3, 4]] for _, v in document))
            self.assertEqual(3, len(wrapped[[1, 3, 4]]))
        else:
            for bad_index in ([1, 3, 4], {1}, 1.0):
                self.assertRaises(RuntimeError, _fetch_slice, wrapped, bad_index)
    def _apply(self, corpus, chunksize=None):
        """Apply transformation in :func:`__getitem__` to the entire corpus.

        Does this by returning gensim's :class:`TransformedCorpus` object
        that applies the transformation lazily over the entire corpus. This
        is essentially a generalization of gensim's VocabTransform class
        with added facilities for backward feature mapping.

        :type corpus: gensim.interfaces.CorpusABC
        :param corpus: The corpus to transform.

        :type chunksize: int
        :param chunksize: If provided, passed through to
            :class:`TransformedCorpus` for chunked processing.

        :rtype: gensim.interfaces.TransformedCorpus
        :returns: The lazily transformed corpus.
        """
        if not isinstance(corpus, TextCorpus):
            # Frequency-based filtering assumes a text corpus with a
            # dictionary; warn, but still wrap the corpus so callers always
            # get a uniform return type. logging.warn is deprecated in favor
            # of logging.warning.
            logging.warning('Frequency-based transformer applied on non-text'
                            ' corpus; returning TransformedCorpus.')

        # NOTE(review): the original implementation contained an unreachable
        # branch after an unconditional return, which deep-copied text
        # corpora and filtered/compactified their dictionary. That dead code
        # has been removed; both the text and non-text paths returned a
        # plain TransformedCorpus, which is what we do here.
        return TransformedCorpus(self, corpus, chunksize)
# ---- Example #4 ----
    def _apply(self, corpus, chunksize=None, **kwargs):
        """Apply the transformation to a whole corpus and get the result as another corpus.

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in sparse Gensim bag-of-words format.
        chunksize : int, optional
            If provided, a more effective chunked processing will be performed.
        **kwargs
            Additional keyword arguments passed through to
            :class:`~gensim.interfaces.TransformedCorpus`.

        Returns
        -------
        :class:`~gensim.interfaces.TransformedCorpus`
            Transformed corpus.

        """
        return TransformedCorpus(self, corpus, chunksize, **kwargs)
# ---- Example #5 ----
    def _apply(self, corpus, chunksize=None, **kwargs):
        """Apply the transformation to a whole corpus and get the result as another corpus.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).
        chunksize : int, optional
            If provided, a more effective chunked processing will be performed.
        **kwargs
            Additional keyword arguments passed through to
            :class:`~gensim.interfaces.TransformedCorpus`.

        Returns
        -------
        :class:`~gensim.interfaces.TransformedCorpus`
            Transformed corpus.

        """
        return TransformedCorpus(self, corpus, chunksize, **kwargs)
# ---- Example #6 ----
 def load_tfidf_corpus(self):
     """Load and return the saved :class:`TransformedCorpus` from the path
     stored in ``self.tfidf_corpus``.
     """
     return TransformedCorpus.load(self.tfidf_corpus)