Beispiel #1
0
 def test_results_topic_words_have_attributes(self):
     # Every word of every topic returned by getResults must expose a
     # string `word` attribute and a float `weight` attribute.
     testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     for topic in testresults.topicResults:
         for word in topic.topicWords:
             self.assertIsInstance(word.word, str)
             self.assertIsInstance(word.weight, float)
Beispiel #2
0
 def test_results_correct_topic_numbers(self):
     # The topic numbers reported by getResults, taken in ascending order,
     # must be exactly 1 and 2.
     outcome = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     observed = sorted(entry.topicNum for entry in outcome.topicResults)
     self.assertEqual(observed, [1, 2])
Beispiel #3
0
def process(inputCorpus, topicNum, datasetChoice):
    '''
    Main method called by server.py to handle processing of input corpus

    Parameters
    ----------
            inputCorpus: inputCorpus
                    the corpus uploaded by the user as an inputCorpus object (imported
                    from jamesClasses)

            topicNum: int
                    the number of topics to be generated by the topic model, provided by the user

            datasetChoice: key of cfg['path']
                    selects the sentiment model settings; the entry stored under this key
                    is indexed from 0 to 4: items 0-3 are passed to getPredictor and
                    item 4 is passed to getSentenceSentiment

    Output
    ------
            dict
                    the result set as a dictionary containing nested dictionaries, to be easily
                    converted to a json object

    Raises
    ------
            AssertionError
                    if the dictionary of the pre-processed corpus does not hold more
                    entries than topicNum
    '''
    # Pre process input corpus using preProcess method imported from jamesPreProcessing
    # Input is inputCorpus object, imported from jamesClasses
    # Output is jamesCorpus object, imported from jamesClasses
    corpus = preProcess(inputCorpus)
    # Raise an error if the input text is too short for the number of topics.
    # The error is raised explicitly instead of through an `assert` statement so
    #   the check is still performed when Python runs with -O; the exception type
    #   and message are the same ones an `assert` would have produced.
    if not len(corpus.dic) > topicNum:
        raise AssertionError("Input is too short for number of selected topics")
    # Load the user-selected sentiment model using getPredictor, imported from jamesSA
    modelInfo = cfg['path'][datasetChoice]
    sentimentmodel, tokenizer = getPredictor(modelInfo[0], modelInfo[1],
                                             modelInfo[2], modelInfo[3])
    # Build the topic model on the corpus using the input number of topics
    topicModel = buildTopicModel(corpus, topicNum)
    # Build the coherence model for generated topic model
    coherenceModel = buildCoherenceModel(topicModel, corpus)
    # Produce a jamesResults object, imported from jamesClasses, containing the topic
    #   model information using getResults, imported from jamesLDA
    results = getResults(topicModel, coherenceModel, corpus)
    # Add the stem dictionary produced in preprocessing to the jamesResults object
    # Words are stemmed for topic modeling, but a dictionary is kept mapping each stem
    #   to a word converted to that stem, which is used to make results more readable
    results.addStemDic(corpus.stemDic)
    # Iterate through each document in the corpus for analysis
    for doc in corpus.docs:
        # Construct a docResults object, imported from jamesClasses, containing the topic
        #   breakdown compared to the constructed topic model for the document in question
        #   produced by getTopics, imported from jamesLDA
        docResult = docResults(doc.title, getTopics(doc.bow, topicModel))
        # Iterate through each sentence in the document for sentiment analysis
        for sentence in doc.sentences:
            # Preprocess the sentence for topic modeling
            processedSentence = preProcessSentence(sentence, corpus.dic)
            # Skip any sentence that has no words in the bag of words after preprocessing
            if len(processedSentence) == 0:
                continue
            # Use the constructed topic model to find the topic distribution for the
            #   current sentence using getTopics, imported from jamesLDA
            sentenceTopics = getTopics(processedSentence, topicModel)
            # Skip over any sentence that cannot be matched to any topic
            if len(sentenceTopics) == 0:
                continue
            # Check to see if this sentence would be a good example sentence for any
            #    topic, and add this sentence to the topic if so
            results.addSentence(sentence, sentenceTopics)
            # Use the sentiment analysis model to find the sentiment for the current
            #   sentence using getSentenceSentiment, imported from jamesSA
            sentenceSentiment = getSentenceSentiment(
                sentimentmodel, [sentence], tokenizer, modelInfo[4])
            # Add the sentence's sentiment to each topic's sentiment for the current
            #   document results docResults object, weighted by the sentence's topic
            #   distribution
            for topic in sentenceTopics:
                docResult.addSentiment(topic[0], topic[1], sentenceSentiment)
        # Calculate the average sentiment for each topic in the current document
        docResult.averageSentiments()
        # Add the docResults object to the jamesResults result set
        results.addDocResults(docResult)
    # Use the jamesResults output method to output results as nested dictionaries
    #   and lists, which can be converted to a json object
    return results.output()
Beispiel #4
0
 def test_model_coherence_nonzero(self):
     # The model coherence must be strictly below or strictly above zero.
     # The comparisons are strict on purpose: with <= / >= the condition
     # would hold for every real number, including exactly 0.0.
     testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     self.assertTrue(testresults.modelCoherence < 0.0
                     or testresults.modelCoherence > 0.0)
Beispiel #5
0
 def test_model_coherence_produced(self):
     # getResults must report the model coherence as a float.
     outcome = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     coherence = outcome.modelCoherence
     self.assertIsInstance(coherence, float)
Beispiel #6
0
 def test_results_correct_word_count(self):
     # Each topic must carry exactly as many words as the 'topicwords'
     # entry of cfg specifies.
     outcome = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     for entry in outcome.topicResults:
         word_total = len(entry.topicWords)
         self.assertEqual(word_total, cfg['topicwords'])
Beispiel #7
0
 def test_results_topic_words_are_words(self):
     # Every word listed under every topic must be a jamesClasses.topicWord.
     testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     for topic in testresults.topicResults:
         for word in topic.topicWords:
             self.assertIsInstance(word, jamesClasses.topicWord)
Beispiel #8
0
 def test_results_topics_have_words(self):
     # No topic returned by getResults may have an empty word list.
     # The comparison is by value: an identity check against a freshly
     # created [] literal can never fail, so it would verify nothing.
     testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     for topic in testresults.topicResults:
         self.assertNotEqual(topic.topicWords, [])
Beispiel #9
0
 def test_results_correct_topic_count(self):
     # getResults must return exactly two topics for the test fixtures.
     outcome = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     topic_total = len(outcome.topicResults)
     self.assertEqual(topic_total, 2)
Beispiel #10
0
 def test_results_topics_are_topics(self):
     # Each element of topicResults must be a jamesClasses.topicResults.
     outcome = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     expected_type = jamesClasses.topicResults
     for entry in outcome.topicResults:
         self.assertIsInstance(entry, expected_type)
Beispiel #11
0
 def test_results_has_topics(self):
     # The topic list returned by getResults must not be empty.
     # The comparison is by value: an identity check against a freshly
     # created [] literal can never fail, so it would verify nothing.
     testresults = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     self.assertNotEqual(testresults.topicResults, [])
Beispiel #12
0
 def test_results_documentResults_empty(self):
     # Straight out of getResults, documentResults must equal an empty list.
     outcome = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     per_document = outcome.documentResults
     self.assertEqual(per_document, [])
Beispiel #13
0
 def test_results_stemDic_empty(self):
     # Straight out of getResults, stemDic must equal an empty dict.
     outcome = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     stems = outcome.stemDic
     self.assertEqual(stems, {})
Beispiel #14
0
 def test_produces_results(self):
     # getResults must hand back a jamesClasses.jamesResults instance.
     outcome = jamesLDA.getResults(TESTMODEL, TESTCOHERENCE, TESTCORPUS)
     expected_type = jamesClasses.jamesResults
     self.assertIsInstance(outcome, expected_type)