Esempio n. 1
0
def process(inputCorpus, topicNum, datasetChoice):
    """
    Main method called by server.py to handle processing of input corpus.

    Builds a topic model over the pre-processed corpus, then walks every
    sentence of every document, attributing the sentence's sentiment to the
    topics it matches.

    Parameters
    ----------
            inputCorpus: inputCorpus
                    the corpus uploaded by the user as an inputCorpus object (imported
                    from jamesClasses)

            topicNum: int
                    the number of topics to be generated by the topic model, provided by the user

            datasetChoice:
                    key used to look up the sentiment model settings in cfg['path'].
                    The looked-up entry is indexed at positions 0-4: positions 0-3 are
                    passed to getPredictor and position 4 to getSentenceSentiment.

    Output
    ------
            dict
                    the result set as a dictionary containing nested dictionaries, to be easily
                    converted to a json object

    Raises
    ------
            AssertionError
                    if the dictionary of the pre-processed corpus does not hold more
                    entries than topicNum
    """
    # Pre process input corpus using preProcess method imported from jamesPreProcessing
    # Input is inputCorpus object, output is jamesCorpus object (both from jamesClasses)
    corpus = preProcess(inputCorpus)

    # Reject input that is too short for the requested number of topics.
    # This is an explicit raise rather than an `assert` statement because asserts
    #   are removed when Python runs with -O, which would silently skip the check.
    #   AssertionError and its message are kept so existing callers behave the same.
    if not len(corpus.dic) > topicNum:
        raise AssertionError(
            "Input is too short for number of selected topics")

    # Load the user-selected sentiment model using getPredictor, imported from jamesSA
    modelInfo = cfg['path'][datasetChoice]
    sentimentmodel, tokenizer = getPredictor(modelInfo[0], modelInfo[1],
                                             modelInfo[2], modelInfo[3])

    # Build the topic model on the corpus, then the coherence model for it
    topicModel = buildTopicModel(corpus, topicNum)
    coherenceModel = buildCoherenceModel(topicModel, corpus)

    # Produce a jamesResults object, imported from jamesClasses, containing the topic
    #   model information using getResults, imported from jamesLDA
    results = getResults(topicModel, coherenceModel, corpus)

    # Words are stemmed for topic modeling, but a dictionary is kept mapping each stem
    #   to a word converted to that stem, which is used to make results more readable
    results.addStemDic(corpus.stemDic)

    for doc in corpus.docs:
        # docResults (from jamesClasses) holds the document's topic breakdown against
        #   the constructed topic model, as produced by getTopics (from jamesLDA)
        docResult = docResults(doc.title, getTopics(doc.bow, topicModel))

        for sentence in doc.sentences:
            # Preprocess the sentence for topic modeling
            processedSentence = preProcessSentence(sentence, corpus.dic)
            # Skip any sentence that has no words in the bag of words after preprocessing
            if len(processedSentence) == 0:
                continue

            # Topic distribution of the current sentence under the topic model
            sentenceTopics = getTopics(processedSentence, topicModel)
            # Skip over any sentence that cannot be matched to any topic
            if len(sentenceTopics) == 0:
                continue

            # Offer this sentence as a possible example sentence for its topics
            results.addSentence(sentence, sentenceTopics)

            # Sentiment of the current sentence via getSentenceSentiment (from jamesSA)
            sentenceSentiment = getSentenceSentiment(
                sentimentmodel, [sentence], tokenizer, modelInfo[4])

            # Add the sentence's sentiment to each matched topic of this document,
            #   weighted by the sentence's topic distribution
            for topic in sentenceTopics:
                docResult.addSentiment(topic[0], topic[1], sentenceSentiment)

        # Calculate the average sentiment for each topic in the current document
        docResult.averageSentiments()
        # Add the docResults object to the jamesResults result set
        results.addDocResults(docResult)

    # Output results as nested dictionaries and lists, convertible to a json object
    return results.output()
Esempio n. 2
0
 def test_topic_identification(self):
     """Compare the getTopics results of the two test sentences.

     For TESTSENTENCEONE the second element of entry 0 must be greater, and
     the second element of entry 1 must be smaller, than the corresponding
     values for TESTSENTENCETWO (presumably per-topic weights; confirm
     against jamesLDA.getTopics).
     """
     first_result, second_result = (
         jamesLDA.getTopics(sample, TESTMODEL)
         for sample in (TESTSENTENCEONE, TESTSENTENCETWO)
     )
     # Entry 0: the first sentence must score higher than the second.
     self.assertGreater(first_result[0][1], second_result[0][1])
     # Entry 1: the first sentence must score lower than the second.
     self.assertLess(first_result[1][1], second_result[1][1])
Esempio n. 3
0
 def test_correct_topic_number(self):
     """Check that getTopics yields exactly two entries for TESTSENTENCEONE."""
     topic_count = len(jamesLDA.getTopics(TESTSENTENCEONE, TESTMODEL))
     self.assertEqual(topic_count, 2)
Esempio n. 4
0
 def test_probabilities_sum_to_one(self):
     """Check that the first two topic values for TESTSENTENCEONE add up to 1.

     The values are read from the second element of entries 0 and 1 of the
     getTopics result. They are compared with a tolerance instead of exact
     equality, because a sum of floating-point numbers is not guaranteed to
     be exactly 1 even when the values are mathematically complementary.
     """
     testtopics = jamesLDA.getTopics(TESTSENTENCEONE, TESTMODEL)
     total = testtopics[0][1] + testtopics[1][1]
     # places=5 tolerates rounding error (including single-precision values)
     #   while still catching values that do not genuinely sum to one.
     self.assertAlmostEqual(total, 1, places=5)
Esempio n. 5
0
 def test_produces_topics(self):
     """Check that getTopics returns a list for TESTSENTENCEONE and TESTMODEL."""
     self.assertIsInstance(
         jamesLDA.getTopics(TESTSENTENCEONE, TESTMODEL), list)