def test_topic_scores(self):
    """Both topics get a numpy.float64 coherence score strictly inside (0, 1)."""
    coherence_model = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS)
    per_topic = coherence_model.get_coherence_per_topic()
    # Check each of the first two topic scores with the same three assertions
    for score in (per_topic[0], per_topic[1]):
        self.assertIsInstance(score, numpy.float64)
        self.assertGreater(score, 0)
        self.assertLess(score, 1)
    # The test model is built with exactly two topics
    self.assertEqual(len(per_topic), 2)
def process(inputCorpus, topicNum, datasetChoice):
    '''
    Main method called by server.py to handle processing of input corpus

    Parameters
    ----------
    inputCorpus: inputCorpus
        the corpus uploaded by the user as an inputCorpus object
        (imported from jamesClasses)
    topicNum: int
        the number of topics to be generated by the topic model,
        provided by the user
    datasetChoice:
        key into cfg['path'] selecting which pretrained sentiment
        model to load

    Output
    ------
    dict
        the result set as a dictionary containing nested dictionaries,
        to be easily converted to a json object

    Raises
    ------
    ValueError
        if the preprocessed corpus vocabulary is not larger than the
        requested number of topics
    '''
    # Pre process input corpus using preProcess method imported from jamesPreProcessing
    # Input is inputCorpus object, imported from jamesClasses
    # Output is jamesCorpus object, imported from jamesClasses
    corpus = preProcess(inputCorpus)
    # Validate with an explicit raise rather than `assert`: assertions are
    # stripped when Python runs with -O, which would silently skip this check
    if len(corpus.dic) <= topicNum:
        raise ValueError("Input is too short for number of selected topics")
    # Load the user-selected sentiment model using getPredictor, imported from jamesSA
    modelInfo = cfg['path'][datasetChoice]
    sentimentmodel, tokenizer = getPredictor(
        modelInfo[0], modelInfo[1], modelInfo[2], modelInfo[3])
    # Build the topic model on the corpus using the input number of topics
    topicModel = buildTopicModel(corpus, topicNum)
    # Build the coherence model for generated topic model
    coherenceModel = buildCoherenceModel(topicModel, corpus)
    # Produce a jamesResults object, imported from jamesClasses, containing the topic
    # model information using getResults, imported from jamesLDA
    results = getResults(topicModel, coherenceModel, corpus)
    # Add the stem dictionary produced in preprocessing to the jamesResults object
    # Words are stemmed for topic modeling, but a dictionary is kept mapping each stem
    # to a word converted to that stem, which is used to make results more readable
    results.addStemDic(corpus.stemDic)
    # Iterate through each document in the corpus for analysis
    for doc in corpus.docs:
        # Construct a docResults object, imported from jamesClasses, containing the
        # topic breakdown compared to the constructed topic model for the document
        # in question, produced by getTopics, imported from jamesLDA
        docResult = docResults(doc.title, getTopics(doc.bow, topicModel))
        # Iterate through each sentence in the document for sentiment analysis
        for sentence in doc.sentences:
            # Preprocess the sentence for topic modeling
            processedSentence = preProcessSentence(sentence, corpus.dic)
            # Skip any sentence that has no words in the bag of words after
            # preprocessing (empty sequences are falsy)
            if not processedSentence:
                continue
            # Use the constructed topic model to find the topic distribution for
            # the current sentence using getTopics, imported from jamesLDA
            sentenceTopics = getTopics(processedSentence, topicModel)
            # Skip over any sentence that cannot be matched to any topic
            if not sentenceTopics:
                continue
            # Check to see if this sentence would be a good example sentence for
            # any topic, and add this sentence to the topic if so
            results.addSentence(sentence, sentenceTopics)
            # Use the sentiment analysis model to find the sentiment for the
            # current sentence using getSentenceSentiment, imported from jamesSA
            sentenceSentiment = getSentenceSentiment(
                sentimentmodel, [sentence], tokenizer, modelInfo[4])
            # Add the sentence's sentiment to each topic's sentiment for the
            # current document's docResults object, weighted by the sentence's
            # topic distribution
            for topic in sentenceTopics:
                docResult.addSentiment(topic[0], topic[1], sentenceSentiment)
        # Calculate the average sentiment for each topic in the current document
        docResult.averageSentiments()
        # Add the docResults object to the jamesResults result set
        results.addDocResults(docResult)
    # Use the jamesResults output method to output results as nested dictionaries
    # and lists, which can be converted to a json object
    return results.output()
def test_model_score(self):
    """The aggregate model coherence is a numpy.float64 strictly inside (0, 1)."""
    built = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS)
    overall = built.get_coherence()
    self.assertIsInstance(overall, numpy.float64)
    # Coherence for the test model should be a proper fraction
    self.assertGreater(overall, 0)
    self.assertLess(overall, 1)
def test_coherence_build(self):
    """buildCoherenceModel returns a gensim CoherenceModel instance."""
    built = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS)
    self.assertIsInstance(
        built, gensim.models.coherencemodel.CoherenceModel)
def test_model_is_average(self):
    """Overall coherence equals the mean of the per-topic coherence scores.

    Uses assertAlmostEqual rather than assertEqual: gensim aggregates the
    per-topic scores internally (numpy mean), which is not guaranteed to be
    bit-identical to sum(...) / len(...), so exact float equality is fragile.
    """
    testcoherence = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS)
    modelcoherence = testcoherence.get_coherence()
    topiccoherence = testcoherence.get_coherence_per_topic()
    self.assertAlmostEqual(
        modelcoherence, sum(topiccoherence) / len(topiccoherence))
# a file in testdata def loadTestSentence(testCorpus, number=""): filename = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testdata', 'testsentence' + number + '.txt') f = open(filename, 'r') rawSentence = f.read() f.close() testSentence = jamesPreProcessing.preProcessSentence( rawSentence, testCorpus.dic) return testSentence # Load the test corpus, test model, and test sentence into constants TESTCORPUS = loadTestCorpus() TESTMODEL = jamesLDA.buildTopicModel(TESTCORPUS, 2) TESTCOHERENCE = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS) TESTSENTENCEONE = loadTestSentence(TESTCORPUS, "one") TESTSENTENCETWO = loadTestSentence(TESTCORPUS, "two") # Tests for buildTopicModel method in jamesLDA class TestJamesLDA_buildTopicModel(unittest.TestCase): # Test that a topic model is built successfully def test_model_build(self): testmodel = jamesLDA.buildTopicModel(TESTCORPUS, 3) self.assertIsInstance(testmodel, gensim.models.ldamodel.LdaModel) # Test that the correct number of topics are constructed def test_correct_topic_num(self): for i in range(1, 4): testmodel = jamesLDA.buildTopicModel(TESTCORPUS, i)