def testClassifier(classifier):
    """Classify a fixed set of sample emails and print a diagnostic trace.

    For every sample this prints the raw text, the result of
    ``nltkpreprocessor.processEmail``, the feature vector returned by
    ``getFeatureVector``, the keys of the ``getFeatures`` mapping whose value
    is exactly ``True``, and the label returned by ``classifier.classify``.
    Once all samples are done, the classifier's ten most informative
    features are shown.

    classifier -- object exposing ``classify(features)`` and
                  ``show_most_informative_features(n)``.
    """
    sampleEmails = [
        'last questions',
        'contact tyrion lannister in corporate for bankruptcy questions',
        'family - they hope for some money from you',
        'URL=mailto:[email protected] as the last contact',
        'winterfell is in westeros or essos?',
    ]

    for sample in sampleEmails:
        cleaned = nltkpreprocessor.processEmail(sample)
        vector = getFeatureVector(cleaned)
        featureMap = getFeatures(vector)
        label = classifier.classify(featureMap)

        # Identity test on purpose: only values that are the True singleton
        # count, not merely truthy ones.
        activeKeys = [name for name, flag in featureMap.items() if flag is True]

        report = (
            (" testEmail: {}", sample),
            ("processedEmail: {}", cleaned),
            (" featureVector: {}", vector),
            (" trueFeatures: {}", activeKeys),
            ("emailSentiment: {}", label),
        )
        for template, shown in report:
            print(template.format(shown))
        print("\n")

    classifier.show_most_informative_features(10)
def processRawEmails(inputFile, outputFile):
    """Extract the distinct features of every email in a file and save them.

    Each line of ``inputFile`` is treated as one email: it is passed through
    ``nltkpreprocessor.processEmail`` and then ``getFeatureVector``. The
    distinct features collected over all lines are written to ``outputFile``,
    one per line, in no particular order.

    inputFile  -- path of the text file holding one raw email per line.
    outputFile -- path of the file to (over)write with the feature list.

    The output file is opened only after the whole input has been processed,
    so a failure while reading or preprocessing leaves an existing output
    file untouched. Both files are closed even when an exception is raised.
    """
    uniqueFeatures = set()

    # NOTE: the 'U' (universal newlines) flag is kept for Python 2 behaviour;
    # it is deprecated on Python 3 and rejected from 3.11 on, where plain 'r'
    # already gives the same newline handling.
    with open(inputFile, 'rU') as fr:
        for line in fr:
            processedEmail = nltkpreprocessor.processEmail(line)
            uniqueFeatures.update(getFeatureVector(processedEmail))

    with open(outputFile, 'w') as fw:
        for feature in uniqueFeatures:
            fw.write("{}\n".format(feature))
def processLabeledEmails(inputFile):
    """Train a Naive Bayes classifier from a file of labeled emails.

    Every non-blank line of ``inputFile`` is split on TAB characters: the
    first field is the email text, the second (with trailing whitespace
    removed) is its sentiment label, and any further fields are ignored.
    Whitespace-only lines are skipped.

    inputFile -- path of the TAB-separated training file.

    Side effects:
      * the module-level ``featureList`` is extended with every feature seen
        and then rebound to a de-duplicated list;
      * the trained classifier is pickled to ``email_classifier.pickle`` in
        the current working directory.

    Returns the trained ``nltk.NaiveBayesClassifier``.
    Raises IndexError if a non-blank line contains no TAB separator.
    """
    global featureList
    emailSentiments = []

    # NOTE: the 'U' (universal newlines) flag is kept for Python 2 behaviour;
    # it is deprecated on Python 3 and rejected from 3.11 on.
    with open(inputFile, 'rU') as fr:
        for line in fr:
            # A blank line (typically a trailing newline at end of file) has
            # no label field and would otherwise fail on emailItems[1].
            if not line.strip():
                continue
            emailItems = line.split('\t')
            emailLine = emailItems[0]
            sentiment = emailItems[1].rstrip()
            processedEmail = nltkpreprocessor.processEmail(emailLine)
            featureVector = getFeatureVector(processedEmail)
            emailSentiments.append((featureVector, sentiment))
            featureList.extend(featureVector)

    # Remove dupes before building the training set.
    # NOTE(review): presumably getFeatures reads the module-level featureList,
    # which is why it is finalised first -- confirm against getFeatures.
    featureList = list(set(featureList))

    # Generate the training set and train the classifier.
    emailTrainingSet = nltk.classify.util.apply_features(getFeatures, emailSentiments)
    classifier = nltk.NaiveBayesClassifier.train(emailTrainingSet)

    # Save the classifier; the handle is closed even if pickling fails.
    with open('email_classifier.pickle', 'wb') as classifierPickle:
        pickle.dump(classifier, classifierPickle)

    return classifier