Example #1
0
 def test_not_trained_given_zero_models(self):
     """Requesting zero models must leave the ensemble with an empty ttda."""
     ensemble_kwargs = dict(
         corpus=common_corpus,
         id2word=common_dictionary,
         num_topics=NUM_TOPICS,
         passes=PASSES,
         num_models=0,
         random_state=RANDOM_STATE,
     )
     untrained = EnsembleLda(**ensemble_kwargs)
     assert len(untrained.ttda) == 0
Example #2
0
 def get_elda(self):
     """Return an EnsembleLda built on the common corpus/dictionary with the module-level test settings.

     The ensemble is configured with ``LdaModel`` as its topic model class.
     """
     settings = dict(
         corpus=common_corpus,
         id2word=common_dictionary,
         num_topics=NUM_TOPICS,
         passes=PASSES,
         num_models=NUM_MODELS,
         random_state=RANDOM_STATE,
         topic_model_class=LdaModel,
     )
     return EnsembleLda(**settings)
Example #3
0
 def test_not_trained_given_zero_iterations(self):
     """Requesting zero iterations must leave the ensemble with an empty ttda."""
     ensemble_kwargs = dict(
         corpus=common_corpus,
         id2word=common_dictionary,
         num_topics=NUM_TOPICS,
         iterations=0,
         num_models=NUM_MODELS,
         random_state=RANDOM_STATE,
     )
     untrained = EnsembleLda(**ensemble_kwargs)
     assert len(untrained.ttda) == 0
Example #4
0
    def test_persisting(self):
        """Save and reload both ensemble variants and compare them with the in-memory original."""
        original = self.get_elda()
        original_mem_unfriendly = self.get_elda_mem_unfriendly()

        # Both ensembles go through the same path; the first one is read back
        # before the second save writes to that path again.
        path = get_tmpfile('gensim_models_ensemblelda')
        original.save(path)
        restored = EnsembleLda.load(path)
        # storing the ensemble without memory_friendly_ttda
        original_mem_unfriendly.save(path)
        restored_mem_unfriendly = EnsembleLda.load(path)

        # right after loading, topic_model_class is expected to still be None
        assert restored.topic_model_class is None

        # --- memory friendly variant: was it stored and loaded correctly? ---
        restored_representation = restored.generate_gensim_representation()

        # generating the representation is expected to have set topic_model_class
        assert restored.topic_model_class == LdaModel

        restored_topics = restored_representation.get_topics()
        restored_ttda = restored.ttda
        restored_distances = restored.asymmetric_distance_matrix
        np.testing.assert_allclose(original.get_topics(), restored_topics, rtol=RTOL)
        np.testing.assert_allclose(original.ttda, restored_ttda, rtol=RTOL)
        np.testing.assert_allclose(
            original.asymmetric_distance_matrix,
            restored_distances,
            rtol=RTOL,
        )

        clustering_before_save = original.cluster_model.results
        clustering_after_load = restored.cluster_model.results
        self.assert_clustering_results_equal(clustering_before_save, clustering_after_load)

        # --- memory unfriendly variant: compared against the memory friendly original ---
        restored_mem_unfriendly_representation = restored_mem_unfriendly.generate_gensim_representation()
        restored_topics = restored_mem_unfriendly_representation.get_topics()
        np.testing.assert_allclose(original.get_topics(), restored_topics, rtol=RTOL)
Example #5
0
    def test_backwards_compatibility_with_persisted_model(self):
        """Compare a freshly built ensemble with the reference model stored under the test data path."""
        fresh = self.get_elda()

        # the pre-trained reference model that the fresh ensemble is measured against
        reference = EnsembleLda.load(datapath('ensemblelda'))
        np.testing.assert_allclose(fresh.ttda, reference.ttda, rtol=RTOL)

        # absolute tolerance scaled by the largest entry of the reference distance matrix
        distance_atol = reference.asymmetric_distance_matrix.max() * 1e-05
        np.testing.assert_allclose(
            fresh.asymmetric_distance_matrix,
            reference.asymmetric_distance_matrix,
            atol=distance_atol,
        )
Example #6
0
    def test_recluster(self):
        """Recluster the persisted reference ensemble and expect unchanged clustering output."""
        # This test is quite specific to the current implementation and not part of any API,
        # but it makes improving those sections of the code easier as long as sorted_clusters
        # and the cluster_model results are supposed to stay the same.
        # Potentially this test will become obsolete.

        ensemble = EnsembleLda.load(datapath('ensemblelda'))

        # snapshot the persisted state before anything gets regenerated
        expected_results = deepcopy(ensemble.cluster_model.results)
        expected_valid_clusters = deepcopy(ensemble.valid_clusters)
        expected_topics = deepcopy(ensemble.get_topics())

        # mark the distance matrix as outdated, recluster, and see if the
        # generated clusters match the pretrained reference
        ensemble.asymmetric_distance_matrix_outdated = True
        ensemble.recluster()

        self.assert_clustering_results_equal(ensemble.cluster_model.results, expected_results)
        assert ensemble.valid_clusters == expected_valid_clusters
        np.testing.assert_allclose(ensemble.get_topics(), expected_topics, rtol=RTOL)
Example #7
0
    def test_multiprocessing(self):
        """Ensembles built with several workers must produce topics matching the helper-built ensembles."""
        # use 3 processes for the ensemble and the distance,
        # so that the 4 models and 8 topics cannot be distributed
        # to each worker evenly
        workers = 3

        # same configuration for both multi-worker ensembles; only
        # memory_friendly_ttda differs between them
        multi_worker_kwargs = dict(
            corpus=common_corpus,
            id2word=common_dictionary,
            topic_model_class=LdaModel,
            num_topics=NUM_TOPICS,
            passes=PASSES,
            num_models=NUM_MODELS,
            random_state=RANDOM_STATE,
            ensemble_workers=workers,
            distance_workers=workers,
        )

        # memory friendly. contains List of topic word distributions
        reference = self.get_elda()
        multi_worker = EnsembleLda(**multi_worker_kwargs)

        # memory unfriendly. contains List of models
        reference_mem_unfriendly = self.get_elda_mem_unfriendly()
        multi_worker_mem_unfriendly = EnsembleLda(memory_friendly_ttda=False, **multi_worker_kwargs)

        np.testing.assert_allclose(
            reference.get_topics(),
            multi_worker.get_topics(),
            rtol=RTOL,
        )
        np.testing.assert_allclose(
            reference_mem_unfriendly.get_topics(),
            multi_worker_mem_unfriendly.get_topics(),
            rtol=RTOL,
        )
Example #8
0
    def test_add_and_recluster(self):
        """After adding models, both ensemble variants must stay in agreement through every reclustering step."""
        num_new_models = 3
        num_new_topics = 3
        random_state = 1
        # number of topics each freshly trained ensemble is expected to contribute
        freshly_trained_topics = num_new_models * num_new_topics

        # train two sets of models (mem friendly and unfriendly) from the same settings
        shared_kwargs = dict(
            corpus=common_corpus,
            id2word=common_dictionary,
            num_topics=num_new_topics,
            passes=10,
            num_models=num_new_models,
            iterations=30,
            random_state=random_state,
            distance_workers=4,
        )
        friendly = EnsembleLda(topic_model_class='lda', **shared_kwargs)
        unfriendly = EnsembleLda(
            topic_model_class=LdaModel,
            memory_friendly_ttda=False,
            **shared_kwargs
        )
        friendly_addition = self.get_elda()
        unfriendly_addition = self.get_elda_mem_unfriendly()
        assert friendly.random_state != friendly_addition.random_state
        assert unfriendly.random_state != unfriendly_addition.random_state

        # both should be similar
        np.testing.assert_allclose(friendly.ttda, unfriendly.ttda, rtol=RTOL)
        np.testing.assert_allclose(friendly.get_topics(), unfriendly.get_topics(), rtol=RTOL)
        # and every next step applied to both should result in similar results

        # 1. adding to ttda and tms
        friendly.add_model(friendly_addition)
        unfriendly.add_model(unfriendly_addition)

        np.testing.assert_allclose(friendly.ttda, unfriendly.ttda, rtol=RTOL)
        assert len(friendly.ttda) == len(friendly_addition.ttda) + freshly_trained_topics
        assert len(unfriendly.ttda) == len(unfriendly_addition.ttda) + freshly_trained_topics
        assert len(unfriendly.tms) == NUM_MODELS + num_new_models
        for ensemble in (friendly, unfriendly):
            self.assert_ttda_is_valid(ensemble)

        # 2. distance matrix
        for ensemble in (friendly, unfriendly):
            ensemble._generate_asymmetric_distance_matrix()
        np.testing.assert_allclose(
            friendly.asymmetric_distance_matrix,
            unfriendly.asymmetric_distance_matrix,
        )

        # 3. CBDBSCAN results
        for ensemble in (friendly, unfriendly):
            ensemble._generate_topic_clusters()
        friendly_clustering = friendly.cluster_model.results
        unfriendly_clustering = unfriendly.cluster_model.results
        self.assert_clustering_results_equal(friendly_clustering, unfriendly_clustering)

        # 4. finally, the stable topics
        for ensemble in (friendly, unfriendly):
            ensemble._generate_stable_topics()
        np.testing.assert_allclose(friendly.get_topics(), unfriendly.get_topics())

        for ensemble in (friendly, unfriendly):
            ensemble.generate_gensim_representation()

        # same random state, hence topics should be still similar
        np.testing.assert_allclose(friendly.get_topics(), unfriendly.get_topics(), rtol=RTOL)
Example #9
0
    def test_add_models(self):
        """Check ttda/tms sizes and ``num_models`` bookkeeping after each kind of ``add_model`` input."""
        # There are many possible configurations for the first parameter of
        # add_model; each one is tried in turn and the resulting counts are checked.

        # quickly train something that can be used for counting results
        num_new_models = 3
        num_new_topics = 3
        quick_training_kwargs = dict(
            corpus=common_corpus,
            id2word=common_dictionary,
            num_topics=num_new_topics,
            passes=1,
            num_models=num_new_models,
            iterations=1,
            random_state=RANDOM_STATE,
            topic_model_class=LdaMulticore,
            workers=3,
            ensemble_workers=2,
        )

        # 1. memory friendly
        donor = self.get_elda()
        receiver = EnsembleLda(**quick_training_kwargs)

        # 1.1 ttda
        ttda_len_before = len(receiver.ttda)
        models_before = receiver.num_models
        receiver.add_model(donor.ttda)
        assert len(receiver.ttda) == ttda_len_before + len(donor.ttda)
        # defaults to 1 for one ttda matrix
        assert receiver.num_models == models_before + 1

        # 1.2 an ensemble
        ttda_len_before = len(receiver.ttda)
        models_before = receiver.num_models
        receiver.add_model(donor, 5)
        assert len(receiver.ttda) == ttda_len_before + len(donor.ttda)
        assert receiver.num_models == models_before + 5

        # 1.3 a list of ensembles
        ttda_len_before = len(receiver.ttda)
        models_before = receiver.num_models
        # it should be totally legit to add a memory unfriendly object to a memory friendly one
        donor_mem_unfriendly = self.get_elda_mem_unfriendly()
        receiver.add_model([donor, donor_mem_unfriendly])
        assert len(receiver.ttda) == ttda_len_before + 2 * len(donor.ttda)
        assert receiver.num_models == models_before + 2 * NUM_MODELS

        # 1.4 a single gensim model
        classic_model = donor.classic_model_representation

        ttda_len_before = len(receiver.ttda)
        models_before = receiver.num_models
        receiver.add_model(classic_model)
        assert len(receiver.ttda) == ttda_len_before + len(classic_model.get_topics())
        assert receiver.num_models == models_before + 1

        # 1.5 a list of gensim models
        ttda_len_before = len(receiver.ttda)
        models_before = receiver.num_models
        receiver.add_model([classic_model, classic_model])
        assert len(receiver.ttda) == ttda_len_before + 2 * len(classic_model.get_topics())
        assert receiver.num_models == models_before + 2

        self.assert_ttda_is_valid(receiver)

        # 2. memory unfriendly
        receiver_mem_unfriendly = EnsembleLda(memory_friendly_ttda=False, **quick_training_kwargs)

        # 2.1 a single ensemble
        tms_len_before = len(receiver_mem_unfriendly.tms)
        models_before = receiver_mem_unfriendly.num_models
        receiver_mem_unfriendly.add_model(donor_mem_unfriendly)
        assert len(receiver_mem_unfriendly.tms) == tms_len_before + NUM_MODELS
        assert receiver_mem_unfriendly.num_models == models_before + NUM_MODELS

        # 2.2 a list of ensembles
        tms_len_before = len(receiver_mem_unfriendly.tms)
        models_before = receiver_mem_unfriendly.num_models
        receiver_mem_unfriendly.add_model([donor_mem_unfriendly, donor_mem_unfriendly])
        assert len(receiver_mem_unfriendly.tms) == tms_len_before + 2 * NUM_MODELS
        assert receiver_mem_unfriendly.num_models == models_before + 2 * NUM_MODELS

        # 2.3 a single gensim model
        tms_len_before = len(receiver_mem_unfriendly.tms)
        models_before = receiver_mem_unfriendly.num_models
        receiver_mem_unfriendly.add_model(donor_mem_unfriendly.tms[0])
        assert len(receiver_mem_unfriendly.tms) == tms_len_before + 1
        assert receiver_mem_unfriendly.num_models == models_before + 1

        # 2.4 a list of gensim models
        tms_len_before = len(receiver_mem_unfriendly.tms)
        models_before = receiver_mem_unfriendly.num_models
        receiver_mem_unfriendly.add_model(donor_mem_unfriendly.tms)
        assert len(receiver_mem_unfriendly.tms) == tms_len_before + NUM_MODELS
        assert receiver_mem_unfriendly.num_models == models_before + NUM_MODELS

        # 2.5 topic term distributions should throw errors, because the
        # actual models are needed for the memory unfriendly ensemble
        tms_len_before = len(receiver_mem_unfriendly.tms)
        models_before = receiver_mem_unfriendly.num_models
        with pytest.raises(ValueError):
            receiver_mem_unfriendly.add_model(donor_mem_unfriendly.tms[0].get_topics())
        # remains unchanged
        assert len(receiver_mem_unfriendly.tms) == tms_len_before
        assert receiver_mem_unfriendly.num_models == models_before

        assert receiver_mem_unfriendly.num_models == len(receiver_mem_unfriendly.tms)
        self.assert_ttda_is_valid(receiver_mem_unfriendly)
Example #10
0
    def test_add_models_to_empty(self):
        """Fill an ensemble created with ``num_models=0`` from ttda slices, then check topics, persistence and inference."""
        trained = self.get_elda()

        # start from an ensemble with no models and feed it the trained ttda in two pieces
        assembled = EnsembleLda(id2word=common_dictionary, num_models=0)
        assembled.add_model(trained.ttda[0:1])
        assembled.add_model(trained.ttda[1:])
        assembled.recluster()
        np.testing.assert_allclose(
            assembled.get_topics(),
            trained.get_topics(),
            rtol=RTOL,
        )

        # persisting an ensemble that is entirely built from existing ttdas
        path = get_tmpfile('gensim_models_ensemblelda')
        assembled.save(path)
        reloaded = EnsembleLda.load(path)
        np.testing.assert_allclose(
            reloaded.get_topics(),
            trained.get_topics(),
            rtol=RTOL,
        )
        self.test_inference(reloaded)