def handle(self, dataset_id, *args, **options):
    """Run the standard topic modeling pipeline for a single dataset.

    Reads ``num_topics`` and ``name`` from ``options``, validates the
    numeric inputs, builds a context with ``default_topic_context`` and
    passes it to ``standard_topic_pipeline``.

    Args:
        dataset_id: Identifier of the dataset to model; must be
            convertible to ``int``.
        *args: Unused positional arguments.
        **options: Command options. ``num_topics`` must be convertible to
            ``int``; ``name`` is forwarded to ``default_topic_context``.

    Raises:
        CommandError: If ``dataset_id`` is missing or not a number, or if
            ``num_topics`` is missing or not a number.
    """
    num_topics = options.get('num_topics')
    name = options.get('name')

    if not dataset_id:
        raise CommandError("Dataset id is required.")

    try:
        dataset_id = int(dataset_id)
    except (TypeError, ValueError):
        raise CommandError("Dataset id must be a number.")

    # Validate num_topics before building the context so that bad input is
    # reported as a CommandError (not a raw TypeError/ValueError) and before
    # any pipeline work has started.
    try:
        num_topics = int(num_topics)
    except (TypeError, ValueError):
        raise CommandError("Number of topics must be a number.")

    # Imported here, as in the original, rather than at module level.
    from pyanalysis.apps.enhance.tasks import default_topic_context, standard_topic_pipeline

    context = default_topic_context(name, dataset_id=dataset_id)
    standard_topic_pipeline(context, dataset_id=dataset_id, num_topics=num_topics)
def test_topic_modeling(self):
    """Generate some test messages and actually model topics.

    Creates ``n_messages`` messages on ``self.dataset``: the first half
    draw their words from one vocabulary, the second half from another.
    The standard topic pipeline is then run with ``num_topics=2`` and the
    resulting dictionary, topic model and per-message topic assignments
    are checked.
    """
    import random
    from collections import defaultdict

    from numpy import random as nprandom

    # Seed both random number generators so the run is repeatable.
    randomseed = 1
    nprandom.seed(randomseed)
    random.seed(randomseed)

    n_messages = 50
    n_words = 5
    num_topics = 2
    half = n_messages // 2

    topic_a_vocab = ['cat', 'hat', 'marbles']
    topic_b_vocab = ['oppossum', 'lasso', 'amalgam']
    total_words = len(topic_a_vocab) + len(topic_b_vocab)

    # Only the Language instance is needed, not the "created" flag.
    english = corpus_models.Language.objects.get_or_create(code='en', name="English")[0]

    # First half of the messages use topic A's words, the rest topic B's.
    for i in xrange(n_messages):
        vocab = topic_a_vocab if i < half else topic_b_vocab
        self.dataset.message_set.create(
            text=" ".join(random.choice(vocab) for _word in xrange(n_words)),
            language=english,
        )

    context = tasks.default_topic_context("test_topic_modeling", dataset_id=self.dataset.id)
    tasks.standard_topic_pipeline(context, dataset_id=self.dataset.id,
                                  num_topics=num_topics, multicore=False)

    dictionary = models.Dictionary.objects.get(name='test_topic_modeling')

    # Check the basic stats of the dictionary
    self.assertEqual(dictionary.num_docs, n_messages)
    self.assertEqual(dictionary.num_pos, n_messages * n_words)

    # Check the links to words
    self.assertEqual(dictionary.words.count(), total_words)

    # Check the links to topic models
    self.assertEqual(dictionary.topicmodel_set.count(), 1)
    topic_model = dictionary.topicmodel_set.first()
    self.assertLess(topic_model.perplexity, -1)

    # Check the number of topics
    self.assertEqual(topic_model.topics.count(), num_topics)
    topics = topic_model.topics.all()
    topic_a = topics[0]
    topic_b = topics[1]

    # Count the messages that prefer each topic. The comparison below is
    # exact: each topic must be preferred by exactly half of the messages.
    topic_count = defaultdict(int)
    for msg in self.dataset.message_set.all():
        topic = topic_model.get_probable_topic(msg)
        topic_count[topic] += 1

    self.assertEqual(topic_count, {
        topic_a: half,
        topic_b: half,
    })

    # The words should be in the default topic names
    for word in topic_a_vocab + topic_b_vocab:
        self.assertTrue(
            word in topic_a.name or word in topic_b.name,
            "%r not found in either topic name" % word,
        )