def main():
    """Train a Naive Bayes spam classifier on e-mail subjects and report metrics.

    Reads up to 500 files per class from PATH/{spam,not_spam}/, extracts each
    "Subject:" line as one labelled example, trains on a 75% split, then prints
    accuracy/precision/recall and the worst misclassifications.
    """
    data = []
    for verdict in ['spam', 'not_spam']:
        # The label is constant for the whole folder, so compute it once here
        # instead of once per file.
        is_spam = verdict == 'spam'
        # Cap at 500 files per class to keep the run fast.
        for filename in glob.glob(PATH + verdict + "/*")[:500]:
            with open(filename, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub(r"^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)  # deterministic train/test split for reproducibility
    train_data, test_data = split_data(data, 0.75)
    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # Quick sanity check on an obviously spammy subject.
    print("Spam" if classifier.classify("Get free laptops now!") > 0.5 else
          "Not Spam")

    # Triples of (subject, actual label, predicted spam probability).
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    # Count (actual, predicted) combinations, predicting spam when p > 0.5.
    count = Counter((is_spam, spam_probability > 0.5)
                    for _, is_spam, spam_probability in classified)

    spammiest_hams, hammiest_spams = most_misclassified(classified)

    print("Accuracy: ", accuracy(count))
    print("Precision: ", precision(count))
    print("Recall: ", recall(count))
    print("\nTop 5 falsely classified as spam:\n\n", spammiest_hams)
    print("\nTop 5 falsely classified as not spam:\n\n", hammiest_spams)
    print("\nMost spammiest words: ", spammiest_word(classifier))
Ejemplo n.º 2
0
class TestClassifier(unittest.TestCase):
    """Tests for NaiveBayesClassifier built from a single 'university' document."""

    def setUp(self):
        # One target class ('university') containing a single training document.
        self.examples = {'university': ['''Abbottabad Public School , also commonly referred to as
        APS and Railway Public School , is a private , all boys , boarding
        school for , 7th to 12th grade students , located in Abbottabad ,
        Pakistan .''']}
        self.classifier = NaiveBayesClassifier(self.examples)

    def test_create_vocabulary(self):
        self.assertIn('private', self.classifier.vocabulary)

    def test_vocabulary_size(self):
        self.assertEqual(self.classifier.vocabulary_size, 28)

    def test_subset_of_documents_with_target_value(self):
        docs = self.classifier.get_documents_with_target_value('university')
        self.assertEqual(len(docs), 1)

    def test_text_of_documents(self):
        docs = self.classifier.get_documents_with_target_value('university')
        self.assertIn('private', self.classifier.get_text(docs))

    def test_text_distinct_words(self):
        docs = self.classifier.get_documents_with_target_value('university')
        text = self.classifier.get_text(docs)
        self.assertEqual(self.classifier.get_text_diff_words_count(text), 28)

    def test_example_count(self):
        self.assertEqual(self.classifier.get_example_count(), 1)

    def test_occurrences_of_word_count(self):
        docs = self.classifier.get_documents_with_target_value('university')
        text = self.classifier.get_text(docs)
        self.assertEqual(self.classifier.occurrences_count(',', text), 7)

    def test_learn(self):
        # Training on the single example should complete without raising.
        self.classifier.learn()

    def test_word_positions_in_doc(self):
        docs = self.classifier.get_documents_with_target_value('university')
        self.assertEqual(len(self.classifier.word_positions(docs[0])), 38)

    def test_classify(self):
        self.classifier.learn()
        sample = self.examples['university'][0]
        self.assertEqual(self.classifier.classify(sample), 'university')
Ejemplo n.º 3
0
def spamFilterChecker():
    """Classify text posted by the Java client as spam or ham.

    Reads 'text_to_be_classified' from the incoming JSON payload, trains a
    Naive Bayes classifier on the local spam corpus under PATH, and returns a
    JSON-like string with key 'email_class' set to 'spam' or 'ham'.
    """
    print('Received the JAVA Request!')
    # Get the text data from the JAVA Program.
    req_data = abhishek_request.get_json()
    text_to_be_classified = req_data['text_to_be_classified']
    print(text_to_be_classified)

    # ----------------------------------------------------------------------------
    # Train a local Naive Bayes classifier on the spam corpus.
    # NOTE(review): a previous comment claimed this makes a POST request to the
    # plino Spam API, but the code below trains locally instead.
    # ----------------------------------------------------------------------------
    data = []
    for verdict in ['spam', 'not_spam']:
        # Label is constant per folder; compute it once, not once per file.
        is_spam = verdict == 'spam'
        for filename in glob.glob(PATH + verdict + "/*")[:500]:
            with open(filename, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub(r"^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)  # reproducible train/test split
    train_data, test_data = split_data(data, 0.80)
    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    value = classifier.classify(text_to_be_classified)
    # NOTE(review): the print below treats `value` as the probability of HAM,
    # so a low value means spam — confirm against classify()'s actual contract.
    if value < 0.9:
        json_response = "{'email_class' : 'spam'}"
    else:
        json_response = "{'email_class' : 'ham'}"
    print("====================================================")
    print("POSSIBILITY OF HAM : ", value)
    print(json_response)
    print("====================================================")
    return json_response
Ejemplo n.º 4
0
	# NOTE(review): the enclosing `def` for this fragment is not visible in this
	# chunk; it appears to be the body of a routine that classifies 'hillary'
	# tweets and writes an HTML report.
	dir = os.path.realpath('..')  # NOTE(review): shadows the builtin `dir` and is unused below
	keyword = 'hillary'
	# Path to the pre-collected training data for this keyword.
	trainingDataFile = '/home/cc/twitterSentiment/src/input/hillary.txt'
	inpfile = open(trainingDataFile, "r")  # NOTE(review): never closed — a `with` block would be safer
	lines = inpfile.read().split()
	tweets = []
	# NOTE(review): this loop iterates the just-created empty `tweets` list, so
	# it never executes and `tweets` stays empty — presumably it was meant to
	# iterate `lines`; confirm the intent before fixing.
	for tweet in tweets:
		tweets.append(tweet)
	time = 'daily'  # NOTE(review): shadows any imported `time` module in this scope
	classifierDumpFile = '/home/cc/twitterSentiment/src//input/naivebayes_model.pickle'
	trainingRequired = 0  # presumably 0 means "load the pickled model" — verify in NaiveBayesClassifier
	# instantiate the instance of classifier class
	nb = NaiveBayesClassifier(tweets, keyword, time, \
								  trainingDataFile, classifierDumpFile, trainingRequired)
	# run the classifier model on tweets
	nb.classify()
	# Render the results as HTML and publish them to the web root.
	htmlcode = nb.getHTML()
	htmlfile = open('/var/www/html/index.html','w')
	htmlfile.write(htmlcode)
	htmlfile.close()
#	time = 'lastweek'
#	twitterData = get_twitter_data.TwitterData()
#	tweets = twitterData.getTwitterData(keyword, time)

# NOTE(review): the body of this `if` only sets `keyword`; the fetch logic is
# commented out, so nothing is done with the 'trump' keyword.
if not os.path.exists('./input/trump.txt'):
	keyword = 'trump'
#	time = 'lastweek'
#	twitterData = get_twitter_data.TwitterData()
#	tweets = twitterData.getTwitterData(keyword, time)

    # NOTE(review): orphaned fragment — `fn`, `data`, and `is_spam` are defined
    # by an enclosing loop that is not visible in this chunk.
    with open(fn, "r") as file:
        for line in file:
            if line.startswith("Subject:"):
                # remove the leading "Subject: " and keep what's left
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

random.seed(0)  # deterministic train/test split for reproducibility
train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject)) for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5) for _, is_spam, spam_probability in classified)

# sort by spam_probability from smallest to largest
classified.sort(key=lambda row: row[2])

# the highest predicted spam probabilities among the non-spams
# (fix: filter() returns an iterator in Python 3 and cannot be sliced,
# so build lists with comprehensions instead)
spammiest_hams = [row for row in classified if not row[1]][-5:]

# the lowest predicted spam probabilities among the actual spams
hammiest_spams = [row for row in classified if row[1]][:5]

# Smoke-test the trained naive_bayes model: class priors, conditional
# probabilities, likelihoods, and the final classification decision.
print("Testing Probabilities")
expected_priors = {True: 0.4, False: 0.6}
for is_scam, prior in expected_priors.items():
    assert naive_bayes.probability('scam', is_scam) == prior
print("passed")

print("Testing Conditional Probabilities")
expected_conditionals = [
    (('errors', True), ('scam', True), 1.0),
    (('links', False), ('scam', True), 0.25),
    (('errors', True), ('scam', False), 0.16666666666666666),
    (('links', False), ('scam', False), 0.5),
]
for feature, condition, expected in expected_conditionals:
    assert naive_bayes.conditional_probability(feature, given=condition) == expected
print("passed")

# The evidence observed for the message being classified.
observed_features = {
    'errors': True,
    'links': False
}

print("Testing Likeihoods")  # (sic — typo preserved from original output)
assert naive_bayes.likelihood(('scam', True), observed_features) == 0.1
assert round(naive_bayes.likelihood(('scam', False), observed_features), 3) == 0.05
print("passed")

print("Testing Classification")
assert naive_bayes.classify(observed_features) == True
print('passed')

print('ALL TESTS PASSED')
Ejemplo n.º 7
0
    # NOTE(review): orphaned fragment — `fn`, `data`, and `is_spam` come from an
    # enclosing loop that is not visible in this chunk.
    with open(fn, 'r') as file:
        for line in file:
            if line.startswith("Subject:"):
                # remove the leading "Subject: " and keep what's left
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

random.seed(0)  # deterministic train/test split for reproducibility
train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)

# sort by spam_probability from smallest to largest
classified.sort(key=lambda row: row[2])

# the highest predicted spam probabilities among the non-spams
# (fix: filter() returns an iterator in Python 3 and cannot be sliced,
# so build lists with comprehensions instead)
spammiest_hams = [row for row in classified if not row[1]][-5:]

# the lowest predicted spam probabilities among the actual spams
hammiest_spams = [row for row in classified if row[1]][:5]