# Example #1
# 0
 def test_topic_scores(self):
     # Each of the two topics should carry a numpy.float64 coherence
     # score strictly inside the open interval (0, 1).
     coherence_model = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS)
     per_topic = coherence_model.get_coherence_per_topic()
     self.assertEqual(len(per_topic), 2)
     for score in per_topic[:2]:
         self.assertIsInstance(score, numpy.float64)
         self.assertGreater(score, 0)
         self.assertLess(score, 1)
# Example #2
# 0
def process(inputCorpus, topicNum, datasetChoice):
    '''
    Main method called by server.py to handle processing of input corpus

    Parameters
    ----------
            inputCorpus: inputCorpus
                    the corpus uploaded by the user as an inputCorpus object (imported
                    from jamesClasses)

            topicNum: int
                    the number of topics to be generated by the topic model, provided by the user

            datasetChoice: key into cfg['path']
                    selects which pretrained sentiment model to load; presumably a
                    string chosen by the user -- TODO confirm against server.py

    Raises
    ------
            AssertionError
                    if the preprocessed corpus dictionary is not larger than the
                    requested number of topics

    Output
    ------
            dict
                    the result set as a dictionary containing nested dictionaries, to be easily
                    converted to a json object
    '''
    # Pre process input corpus using preProcess method imported from jamesPreProcessing
    # Input is inputCorpus object, imported from jamesClasses
    # Output is jamesCorpus object, imported from jamesClasses
    corpus = preProcess(inputCorpus)
    # Raise an error if the input text is too short for the number of topics
    # NOTE(review): assert is stripped when Python runs with -O, so this
    # guard silently disappears in optimized mode -- consider raising
    # ValueError instead (would change the exception type callers see)
    assert len(corpus.dic
               ) > topicNum, "Input is too short for number of selected topics"
    # Load the user-selected sentiment model using getPredictor, imported from jamesSA
    # modelInfo is an indexable record of model artifacts; indices 0-3 feed
    # getPredictor and index 4 is passed to getSentenceSentiment below --
    # exact meaning of each slot is defined by cfg, not visible here
    modelInfo = cfg['path'][datasetChoice]
    sentimentmodel, tokenizer = getPredictor(modelInfo[0], modelInfo[1],
                                             modelInfo[2], modelInfo[3])
    # Build the topic model on the corpus using the input number of topics
    topicModel = buildTopicModel(corpus, topicNum)
    # Build the coherence model for generated topic model
    coherenceModel = buildCoherenceModel(topicModel, corpus)
    # Produce a jamesResults object, imported from jamesClasses, containing the topic
    #   model information using getResults, imported from jamesLDA
    results = getResults(topicModel, coherenceModel, corpus)
    # Add the stem dictionary produced in preprocessing to the jamesResults object
    # Words are stemmed for topic modeling, but a dictionary is kept mapping each stem
    #   to a word converted to that stem, which is used to make results more readable
    results.addStemDic(corpus.stemDic)
    # Iterate through each document in the corpus for analysis
    for doc in corpus.docs:
        # Construct a docResults object, imported from jamesClasses, containing the topic
        #   breakdown compared to the constructed topic model for the document in question
        #   produced by getTopics, imported from jamesLDA
        docResult = docResults(doc.title, getTopics(doc.bow, topicModel))
        # Iterate through each sentence in the document for sentiment analysis
        for sentence in doc.sentences:
            # Preprocess the sentence for topic modeling
            processedSentence = preProcessSentence(sentence, corpus.dic)
            # Skip any sentence that has no words in the bag of words after preprocessing
            if len(processedSentence) > 0:
                # Use the constructed topic model to find the topic distribution for the
                #   current sentence using getTopics, imported from jamesLDA
                sentenceTopics = getTopics(processedSentence, topicModel)
                # Skip over any sentence that cannot be matched to any topic
                if len(sentenceTopics) > 0:
                    # Check to see if this sentence would be a good example sentence for any
                    #    topic, and add this sentence to the topic if so
                    results.addSentence(sentence, sentenceTopics)
                    # Use the sentiment analysis model to find the sentiment for the current
                    #   sentence using getSentenceSentiment, imported from jamesSA
                    sentenceSentiment = getSentenceSentiment(
                        sentimentmodel, [sentence], tokenizer, modelInfo[4])
                    # Add the sentence's sentiment to each topic's sentiment for the current
                    #   document results docResults object, weighted by the sentence's topic
                    #   distribution; each topic appears to be a (topic_id, weight)
                    #   pair -- TODO confirm against getTopics in jamesLDA
                    for topic in sentenceTopics:
                        docResult.addSentiment(topic[0], topic[1],
                                               sentenceSentiment)
        # Calculate the average sentiment for each topic in the current document
        docResult.averageSentiments()
        # Add the docResults object to the jamesResults result set
        results.addDocResults(docResult)
    # Use the jamesResults output method to output results as nested dictionaries
    #   and lists, which can be converted to a json object
    return results.output()
# Example #3
# 0
 def test_model_score(self):
     # The aggregate model coherence should be a numpy.float64 strictly
     # between 0 and 1.
     coherence_model = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS)
     overall = coherence_model.get_coherence()
     self.assertIsInstance(overall, numpy.float64)
     self.assertGreater(overall, 0)
     self.assertLess(overall, 1)
# Example #4
# 0
 def test_coherence_build(self):
     # buildCoherenceModel should hand back a gensim CoherenceModel.
     built = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS)
     expected_type = gensim.models.coherencemodel.CoherenceModel
     self.assertIsInstance(built, expected_type)
# Example #5
# 0
 def test_model_is_average(self):
     """The overall model coherence should equal the mean of the
     per-topic coherence scores.

     Uses assertAlmostEqual rather than assertEqual: both sides are
     floats produced by separate summations, so bit-exact equality is
     not guaranteed and the original exact comparison was brittle.
     """
     testcoherence = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS)
     modelcoherence = testcoherence.get_coherence()
     topiccoherence = testcoherence.get_coherence_per_topic()
     self.assertAlmostEqual(modelcoherence,
                            sum(topiccoherence) / len(topiccoherence))
# Example #6
# 0
#    a file in testdata
def loadTestSentence(testCorpus, number=""):
    """Load and preprocess one test sentence from a file in testdata.

    Parameters
    ----------
    testCorpus :
        preprocessed corpus object whose dictionary attribute (.dic) is
        used to convert the raw sentence text into model input
    number : str, optional
        suffix selecting which testdata/testsentence<number>.txt to read
        (e.g. "one", "two"); defaults to "" for testsentence.txt

    Returns
    -------
    the preprocessed sentence as returned by
    jamesPreProcessing.preProcessSentence
    """
    filename = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                            'testdata', 'testsentence' + number + '.txt')
    # Context manager guarantees the file handle is closed even if read()
    # raises; the original explicit open/close leaked the handle on error.
    with open(filename, 'r') as f:
        rawSentence = f.read()
    return jamesPreProcessing.preProcessSentence(rawSentence, testCorpus.dic)


# Load the test corpus, test model, and test sentence into constants
# shared by the test cases below. Order matters: the corpus must exist
# before a model can be built from it, and the model before a coherence
# model can be built from the pair.
TESTCORPUS = loadTestCorpus()
# Two-topic model used as the fixture by most tests.
TESTMODEL = jamesLDA.buildTopicModel(TESTCORPUS, 2)
TESTCOHERENCE = jamesLDA.buildCoherenceModel(TESTMODEL, TESTCORPUS)
# Preprocessed sentences read from testdata/testsentence{one,two}.txt.
TESTSENTENCEONE = loadTestSentence(TESTCORPUS, "one")
TESTSENTENCETWO = loadTestSentence(TESTCORPUS, "two")


# Tests for buildTopicModel method in jamesLDA
class TestJamesLDA_buildTopicModel(unittest.TestCase):
    # Test that a topic model is built successfully
    def test_model_build(self):
        # Building with three topics should yield a gensim LdaModel.
        built = jamesLDA.buildTopicModel(TESTCORPUS, 3)
        self.assertIsInstance(built, gensim.models.ldamodel.LdaModel)

    # Test that the correct number of topics are constructed
    def test_correct_topic_num(self):
        for i in range(1, 4):
            testmodel = jamesLDA.buildTopicModel(TESTCORPUS, i)