Example #1
0
def testClassifier(classifier):
    """Classify a fixed set of sample emails and print a diagnostic report.

    Every sample is passed through ``nltkpreprocessor.processEmail``,
    ``getFeatureVector`` and ``getFeatures``; the resulting features are handed
    to ``classifier.classify``.  For each sample the raw text, the processed
    text, the feature vector, the names of the features whose value is exactly
    ``True`` and the predicted label are printed.  Finally the classifier's
    ten most informative features are shown.

    Args:
        classifier: object exposing ``classify(features)`` and
            ``show_most_informative_features(n)`` (presumably a trained NLTK
            classifier -- confirm against the caller).
    """
    samples = (
        'last questions',
        'contact tyrion lannister in corporate for bankruptcy questions',
        'family - they hope for some money from you',
        'URL=mailto:[email protected] as the last contact',
        'winterfell is in westeros or essos?',
    )

    for testEmail in samples:
        cleaned = nltkpreprocessor.processEmail(testEmail)
        vector = getFeatureVector(cleaned)
        features = getFeatures(vector)
        label = classifier.classify(features)

        # Identity check on purpose: only features that are literally True
        # are reported, other truthy values are left out.
        active = [name for name in features if features[name] is True]

        report = (
            ("     testEmail: {}", testEmail),
            ("processedEmail: {}", cleaned),
            (" featureVector: {}", vector),
            ("  trueFeatures: {}", active),
            ("emailSentiment: {}", label),
        )
        for template, value in report:
            # A single parenthesised argument prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print(template.format(value))
        print("\n")

    classifier.show_most_informative_features(10)
Example #2
0
def processRawEmails(inputFile, outputFile):
    """Write the distinct features found in a file of raw emails.

    ``inputFile`` is read line by line (each line is treated as one email).
    Every line is passed through ``nltkpreprocessor.processEmail`` and
    ``getFeatureVector``; each distinct feature is then written to
    ``outputFile`` on its own line.

    Features are emitted in the order in which they are first encountered, so
    the same input always produces the same output file.  The output file is
    opened only after the whole input has been processed, so a failure while
    reading or preprocessing does not leave a truncated output file behind.

    Args:
        inputFile: path of the text file to read, one email per line.
        outputFile: path of the file that is created or overwritten with the
            feature list.
    """
    seen = set()
    orderedFeatures = []

    # NOTE: 'U' is the Python 2 universal-newlines flag; it was removed in
    # Python 3.11, where plain 'r' already gives the same newline handling.
    with open(inputFile, 'rU') as fr:
        for line in fr:
            processedEmail = nltkpreprocessor.processEmail(line)
            for feature in getFeatureVector(processedEmail):
                # De-duplicate while preserving first-seen order.
                if feature not in seen:
                    seen.add(feature)
                    orderedFeatures.append(feature)

    # The context manager closes the handle even if a write fails.
    with open(outputFile, 'w') as fw:
        for feature in orderedFeatures:
            fw.write("{}\n".format(feature))
Example #3
0
def processLabeledEmails(inputFile):
    """Train, pickle and return a Naive Bayes classifier from labeled emails.

    Each non-blank line of ``inputFile`` is split on tab characters: the first
    field is the email text and the second field (trailing whitespace
    stripped) is its label.  The text is passed through
    ``nltkpreprocessor.processEmail`` and ``getFeatureVector``; the resulting
    (feature vector, label) pairs are turned into a training set with
    ``getFeatures`` and used to train ``nltk.NaiveBayesClassifier``.

    Whitespace-only lines (for example a trailing empty line) are skipped.

    Side effects:
        * Extends the module-level ``featureList`` with every feature seen and
          then rebinds it to a de-duplicated list.  ``featureList`` must
          already exist when this function is called.
        * Writes the trained classifier to ``email_classifier.pickle``
          (a relative path, so it lands in the current working directory).

    Args:
        inputFile: path of the tab-separated file of labeled emails.

    Returns:
        The trained classifier.

    Raises:
        IndexError: if a non-blank line contains no tab character.
    """
    global featureList
    emailSentiments = []

    # NOTE: 'U' is the Python 2 universal-newlines flag; it was removed in
    # Python 3.11, where plain 'r' already gives the same newline handling.
    with open(inputFile, 'rU') as fr:
        for line in fr:
            # A blank line has no label field; skip it instead of failing on
            # the index lookup below.
            if not line.strip():
                continue

            emailItems = line.split('\t')
            emailLine = emailItems[0]
            sentiment = emailItems[1].rstrip()

            processedEmail = nltkpreprocessor.processEmail(emailLine)
            featureVector = getFeatureVector(processedEmail)
            emailSentiments.append((featureVector, sentiment))
            featureList.extend(featureVector)

    # Remove duplicates from the shared feature list.
    featureList = list(set(featureList))

    # Generate the training set.
    emailTrainingSet = nltk.classify.util.apply_features(getFeatures, emailSentiments)

    # Train the classifier.
    classifier = nltk.NaiveBayesClassifier.train(emailTrainingSet)

    # Save the classifier; the context manager closes the file even if
    # pickling fails part-way through.
    with open('email_classifier.pickle', 'wb') as classifierPickle:
        pickle.dump(classifier, classifierPickle)

    return classifier