Esempio n. 1
0
 def CompareAngels(angel1, angel2, sentences):
     """Dump details from both models when their sentiment labels disagree.

     Silently does nothing when either model is missing.
     """
     if angel1 is None:
         return
     if angel2 is None:
         return
     firstLabel = Sentiment.GetSentimentClass(
         angel1.PredictReviewScore(sentences))
     secondLabel = Sentiment.GetSentimentClass(
         angel2.PredictReviewScore(sentences))
     if firstLabel != secondLabel:
         for model in (angel1, angel2):
             model.DumpDetails(sentences)
 def predict(self, filePath):
     """Predict a sentiment label for every parsed review found next to
     filePath and return the annotated classification data."""
     sentiLexicon = util.LoadLexiconFromCSV(
         "../files/lexicons/SentiWordNet_Lexicon_concise.csv")
     predictor = Angel(sentiLexicon, True)
     reviewsPath = os.path.join(os.path.dirname(filePath),
                                "YelpParsedReviews.json")
     with open(reviewsPath, 'r') as handle:
         rawJson = handle.read()
     data = json.loads(rawJson)
     model = data["ClassificationModel"]
     # Entries are keyed by 1-based string indices: "1" .. str(len(model)).
     for index in range(1, len(model) + 1):
         entry = model[str(index)]
         negationCount = entry["NotCount"]
         if "Sentences" not in entry:
             continue
         # Normalise a lone sentence into a one-element list.
         if not isinstance(entry["Sentences"], list):
             entry["Sentences"] = [entry["Sentences"]]
         reviewSentences = entry["Sentences"]
         score = predictor.PredictReviewScore(reviewSentences, negationCount)
         entry["Label"] = Sentiment.GetSentimentClass(score, 1)
         predictor.DumpDetails(reviewSentences, entry["Label"])
     return data
    def PerformTest(self):
        """
        Load the test data, predict a sentiment label for each review, and
        print per-class precision, recall and F1 plus overall accuracy,
        remembering which threshold maximised each metric.
        """
        angel = Angel(self.lexicon, True)
        # Presumably the upper/lower score cutoffs for dump output — confirm
        # against Angel.SetDumpParameters.
        angel.SetDumpParameters(7, -7)
        # Best-threshold bookkeeping: *x holds the threshold, max* the metric.
        posx, negx, neutx, accx, = 0, 0, 0, 0
        maxnegf1 = maxneutf1 = maxposf1 = maxacc = 0
        # NOTE(review): range(1, 0, -1) sweeps only threshold == 1; a wider
        # range looks intended — confirm.
        for threshold in range(1, 0, -1):
            predictedOverall = []
            expectedSentiment = []
            demons = TotPos = TotNeg = TotNeut = 0
            # Consume the test set until exhaustion or an end-of-data label.
            while True:
                try:
                    sentences, label, notCount, docId = self.NextElement()
                    if not sentences:
                        continue
                    # A 'NULL' label marks the end of usable records.
                    if label == 'NULL':
                        break
                    label = int(label)
                    expectedSentiment.append(label)
                    # NOTE(review): the expected label (not notCount) is passed
                    # as the second argument, unlike other call sites — confirm
                    # this is intended.
                    predicted = angel.PredictReviewScore(sentences, label)
                    predictedOverall.append(Sentiment.GetSentimentClass(predicted, threshold))
                    # Per-class totals of the expected labels.
                    # NOTE(review): TotPos/TotNeg/TotNeut are accumulated but
                    # never read afterwards.
                    if label == Sentiment.POSITIVE:
                        TotPos += 1
                    elif label == Sentiment.NEGATIVE:
                        TotNeg += 1
                    else:
                        TotNeut += 1
                    # "Demons" are reviews whose score/label combination asks
                    # for a detailed dump.
                    if angel.DumpRequested(predicted, label):
                        print "ID", docId, "\n"
                        demons += 1
                except StopIteration:
                    break

            print "Demons:", demons
            # Per-class metrics; class codes: 1 = positive, -1 = negative,
            # 0 = neutral.
            pos_prec = util.precision_with_class(predictedOverall, expectedSentiment, 1)
            neg_prec = util.precision_with_class(predictedOverall, expectedSentiment, -1)
            neut_prec = util.precision_with_class(predictedOverall, expectedSentiment, 0)
            pos_rec = util.recall_with_class(predictedOverall, expectedSentiment, 1)
            neg_rec = util.recall_with_class(predictedOverall, expectedSentiment, -1)
            neut_rec = util.recall_with_class(predictedOverall, expectedSentiment, 0)
            pos_f1 = util.f1_with_class(predictedOverall, expectedSentiment, 1)
            neg_f1 = util.f1_with_class(predictedOverall, expectedSentiment, -1)
            neut_f1 = util.f1_with_class(predictedOverall, expectedSentiment, 0)
            accuracy = util.accuracy(predictedOverall, expectedSentiment)
            # Report precision / recall / F1 for this threshold.
            print "Current Positive stats (", threshold, "): ","\t", '{:.2%}'.format(pos_prec), \
                "\t", '{:.2%}'.format(pos_rec), "\t", '{:.2%}'.format(pos_f1)
            print "Current Negative stats (", threshold, "): ", "\t", '{:.2%}'.format(neg_prec), "\t", \
                '{:.2%}'.format(neg_rec), "\t", '{:.2%}'.format(neg_f1)
            print "Current Neutral stats (", threshold, "): ", "\t", '{:.2%}'.format(neut_prec), "\t", \
                '{:.2%}'.format(neut_rec), "\t", '{:.2%}'.format(neut_f1)
            cprint("Current Accuracy ( " + str(threshold) + " ):\t\t\t" + '{:.2%}'.format(accuracy), 'red')
            # Track the best threshold seen so far for each metric.
            if pos_f1 > maxposf1:
                maxposf1 = pos_f1
                posx = threshold
            if neg_f1 > maxnegf1:
                maxnegf1 = neg_f1
                negx = threshold
            if neut_f1 > maxneutf1:
                maxneutf1 = neut_f1
                neutx = threshold
            if accuracy > maxacc:
                maxacc = accuracy
                accx = threshold
        print "Maximum Positive F1: ", '{:.2%}'.format(maxposf1), "at", posx
        print "Maximum Negative F1: ", '{:.2%}'.format(maxnegf1), "at", negx
        print "Maximum Neutral F1: ", '{:.2%}'.format(maxneutf1), "at", neutx
        cprint("Maximum Accuracy: " + '{:.2%}'.format(maxacc) + " at " + str(accx), 'red')
Esempio n. 4
0
def ImpactTraining(docPath, lexPath, lexiconID):
    """
    Final score of the review is calculated as follows:
    (Score1*Multiplier1 + Score2*Multiplier2 ... ScoreN*MultiplierN) * BaseMultiplier = ReviewScore
    Ignoring BaseMultiplier for this training, assuming it has minimal impact (TODO: test this impact)
    ScoreK*MultiplierK/ReviewScore * 100 = PercentImpact (impact %age of word K on the final score)
    TotalAdjustment = Expected Score - FinalScore
    AdjustmentK = PercentImpact of TotalAdjustment  (total adjustment needed for word K)
    Adjustment on word K for this review = AdjustmentK/MultiplierK
    Take the mean of all adjustments, and applying to the final lexicon, to get the new lexicon
    Repeat process until there is performance improvement.
    """
    oldAccuracy = 0
    oldAngel = None
    se = PerformanceTest(lexPath, docPath)
    while True:
        adjustments = defaultdict(list)
        newAngel = Angel(se.lexicon, smallReviews=True)
        expectedSentiment, predictedOverall = [], []
        se.ResetIterator()
        while True:
            try:
                sentences, expectedLabel, notCount, docId = se.NextElement()
                expectedSentiment.append(expectedLabel)
                predictedScore = newAngel.PredictReviewScore(
                    sentences, expectedLabel)
                predictedLabel = Sentiment.GetSentimentClass(predictedScore)
                predictedOverall.append(predictedLabel)
                if oldAngel is not None:
                    oldPredictedLabel = Sentiment.GetSentimentClass(
                        oldAngel.PredictReviewScore(sentences, expectedLabel))
                    if oldPredictedLabel != predictedLabel:
                        oldAngel.DumpDetails(sentences, expectedLabel)
                        newAngel.DumpDetails(sentences, expectedLabel)
                totalImpact, impactTable = newAngel.GetImpact(sentences)
                if totalImpact == 0:
                    continue
                totalAdjustment = expectedLabel * 10 - predictedScore
                for word, (wordScore, multiplier) in impactTable.iteritems():
                    if multiplier == 0:
                        continue
                    wordAdjustment = ((wordScore / totalImpact) *
                                      totalAdjustment) / multiplier
                    if wordAdjustment != 0:
                        adjustments[word].append(wordAdjustment)
            except StopIteration:
                break
        newAccuracy = util.accuracy(predictedOverall, expectedSentiment)
        print "Accuracy:", oldAccuracy, "--->", newAccuracy
        if newAccuracy <= oldAccuracy:
            break
        for word in adjustments:
            se.lexicon[word] = str(
                float(se.lexicon[word]) + numpy.mean(adjustments[word]))
        oldAngel = newAngel
        oldAccuracy = newAccuracy

    filename = "../files/lexicons/" + lexiconID + ".csv"
    handle = open(filename, 'wb')
    wr = csv.writer(handle)
    for key, value in sorted(oldAngel.lexicon.items()):
        row = [key, value]
        wr.writerow(row)
    handle.close()