def test_results_topic_words_have_attributes(self):
    '''Each topic word in the results exposes a str `word` and a float `weight`.'''
    testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    for topic in testresults.topicResults:
        for word in topic.topicWords:
            self.assertIsInstance(word.word, str)
            self.assertIsInstance(word.weight, float)
def test_results_correct_topic_numbers(self):
    '''The topic numbers found in the results, once sorted, are exactly [1, 2].'''
    results = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    # Collect the number of every topic in ascending order before comparing
    foundNumbers = sorted(entry.topicNum for entry in results.topicResults)
    self.assertEqual(foundNumbers, [1, 2])
def process(inputCorpus, topicNum, datasetChoice):
    '''
    Main method called by server.py to handle processing of input corpus

    Parameters
    ----------
    inputCorpus: inputCorpus
            the corpus uploaded by the user as an inputCorpus object
            (imported from jamesClasses)

    topicNum: int
            the number of topics to be generated by the topic model,
            provided by the user

    datasetChoice: key of cfg['path']
            selects which entry of cfg['path'] is used to load the
            sentiment model

    Output
    ------
    dict
            the result set as a dictionary containing nested dictionaries,
            to be easily converted to a json object

    Raises
    ------
    AssertionError
            if the dictionary of the preprocessed corpus does not hold more
            entries than topicNum
    '''
    # Pre process input corpus using preProcess method imported from jamesPreProcessing
    # Input is inputCorpus object, imported from jamesClasses
    # Output is jamesCorpus object, imported from jamesClasses
    corpus = preProcess(inputCorpus)

    # Raise an error if the input text is too short for the number of topics.
    # The exception is raised explicitly instead of through an assert statement
    # so that the check still runs when Python is started with -O (which removes
    # assert statements); the exception type and message are unchanged.
    if not len(corpus.dic) > topicNum:
        raise AssertionError("Input is too short for number of selected topics")

    # Load the user-selected sentiment model using getPredictor, imported from jamesSA
    modelInfo = cfg['path'][datasetChoice]
    sentimentmodel, tokenizer = getPredictor(modelInfo[0], modelInfo[1], modelInfo[2], modelInfo[3])

    # Build the topic model on the corpus using the input number of topics
    topicModel = buildTopicModel(corpus, topicNum)
    # Build the coherence model for generated topic model
    coherenceModel = buildCoherenceModel(topicModel, corpus)

    # Produce a jamesResults object, imported from jamesClasses, containing the topic
    # model information using getResults, imported from jamesLDA
    results = getResults(topicModel, coherenceModel, corpus)
    # Add the stem dictionary produced in preprocessing to the jamesResults object
    # Words are stemmed for topic modeling, but a dictionary is kept mapping each stem
    # to a word converted to that stem, which is used to make results more readable
    results.addStemDic(corpus.stemDic)

    # Iterate through each document in the corpus for analysis
    for doc in corpus.docs:
        # Construct a docResults object, imported from jamesClasses, containing the topic
        # breakdown compared to the constructed topic model for the document in question
        # produced by getTopics, imported from jamesLDA
        docResult = docResults(doc.title, getTopics(doc.bow, topicModel))

        # Iterate through each sentence in the document for sentiment analysis
        for sentence in doc.sentences:
            # Preprocess the sentence for topic modeling
            processedSentence = preProcessSentence(sentence, corpus.dic)
            # Skip any sentence that has no words in the bag of words after preprocessing
            if len(processedSentence) > 0:
                # Use the constructed topic model to find the topic distribution for the
                # current sentence using getTopics, imported from jamesLDA
                sentenceTopics = getTopics(processedSentence, topicModel)
                # Skip over any sentence that cannot be matched to any topic
                if len(sentenceTopics) > 0:
                    # Check to see if this sentence would be a good example sentence for any
                    # topic, and add this sentence to the topic if so
                    results.addSentence(sentence, sentenceTopics)
                    # Use the sentiment analysis model to find the sentiment for the current
                    # sentence using getSentenceSentiment, imported from jamesSA
                    # NOTE(review): the meaning of modelInfo[4] is not visible here;
                    # confirm against getSentenceSentiment in jamesSA
                    sentenceSentiment = getSentenceSentiment(
                        sentimentmodel, [sentence], tokenizer, modelInfo[4])
                    # Add the sentence's sentiment to each topic's sentiment for the current
                    # document results docResults object, weighted by the sentence's topic
                    # distribution
                    for topic in sentenceTopics:
                        docResult.addSentiment(topic[0], topic[1], sentenceSentiment)

        # Calculate the average sentiment for each topic in the current document
        docResult.averageSentiments()
        # Add the docResults object to the jamesResults result set
        results.addDocResults(docResult)

    # Use the jamesResults output method to output results as nested dictionaries
    # and lists, which can be converted to a json object
    return results.output()
def test_model_coherence_nonzero(self):
    '''The model coherence stored on the results is strictly below or above zero.'''
    testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    coherence = testresults.modelCoherence
    # Strict comparisons are required here: `<= 0.0 or >= 0.0` holds for every
    # ordinary number, including 0.0 itself, so it could never detect a zero value.
    self.assertTrue(coherence < 0.0 or coherence > 0.0)
def test_model_coherence_produced(self):
    '''The results carry a modelCoherence attribute that is a float.'''
    results = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    coherence = results.modelCoherence
    self.assertIsInstance(coherence, float)
def test_results_correct_word_count(self):
    '''Every topic holds exactly cfg['topicwords'] topic words.'''
    results = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    for entry in results.topicResults:
        wordCount = len(entry.topicWords)
        # Compare against the configured number of words per topic
        self.assertEqual(wordCount, cfg['topicwords'])
def test_results_topic_words_are_words(self):
    '''Every entry in each topic's topicWords is a jamesClasses.topicWord.'''
    testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    for topic in testresults.topicResults:
        for word in topic.topicWords:
            self.assertIsInstance(word, jamesClasses.topicWord)
def test_results_topics_have_words(self):
    '''Every topic in the results has at least one topic word.'''
    testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    for topic in testresults.topicResults:
        # Check the length rather than `assertIsNot(..., [])`: an identity
        # comparison against a freshly created list literal always succeeds,
        # so it would pass even for a topic without any words.
        self.assertGreater(len(topic.topicWords), 0)
def test_results_correct_topic_count(self):
    '''The results contain exactly two topics.'''
    results = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    topicCount = len(results.topicResults)
    self.assertEqual(topicCount, 2)
def test_results_topics_are_topics(self):
    '''Every element of topicResults is a jamesClasses.topicResults instance.'''
    results = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    topics = results.topicResults
    for entry in topics:
        self.assertIsInstance(entry, jamesClasses.topicResults)
def test_results_has_topics(self):
    '''The results contain at least one topic.'''
    testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    # Check the length rather than `assertIsNot(..., [])`: an identity
    # comparison against a freshly created list literal always succeeds,
    # so it would pass even when no topics were produced.
    self.assertGreater(len(testresults.topicResults), 0)
def test_results_documentResults_empty(self):
    '''documentResults equals an empty list on the object returned by getResults.'''
    results = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    documentResults = results.documentResults
    self.assertEqual(documentResults, [])
def test_results_stemDic_empty(self):
    '''stemDic equals an empty dict on the object returned by getResults.'''
    results = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    stemDic = results.stemDic
    self.assertEqual(stemDic, {})
def test_produces_results(self):
    '''getResults returns a jamesClasses.jamesResults instance.'''
    produced = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
    self.assertIsInstance(produced, jamesClasses.jamesResults)