def train_and_test_model(path):
    """Train a NaiveBayesClassifier on subject data and print diagnostics.

    Loads ``(subject, is_spam)`` pairs with ``get_subject_data(path)``,
    splits them with ``split_data(data, 0.7)``, trains on the first part
    and scores every subject of the second part. Prints, in order: the
    Counter of (actual, predicted) pairs using a 0.7 threshold, the five
    highest-scoring hams, the five lowest-scoring spams, and the five
    highest / lowest entries of ``classifier.word_probs`` when ordered by
    ``p_spam_given_word``.
    """
    data = get_subject_data(path)
    random.seed(0)  # fixed seed so the split is reproducible
    train_data, test_data = split_data(data, 0.7)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # triplets of (subject, actual label, value returned by classify)
    scored = [(text, label, classifier.classify(text))
              for text, label in test_data]

    # tally (actual, predicted) pairs
    counts = Counter((label, score > 0.7) for _, label, score in scored)
    print(counts)

    # ascending by score: low scores first, high scores last
    ranked = sorted(scored, key=lambda triple: triple[2])
    spammiest_hams = [triple for triple in ranked if not triple[1]][-5:]
    hammiest_spams = [triple for triple in ranked if triple[1]][:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    ordered_words = sorted(classifier.word_probs, key=p_spam_given_word)

    print("spammiest_words", ordered_words[-5:])
    print("hammiest_words", ordered_words[:5])
Esempio n. 2
0
def train_and_test_model(path):
    """Train a NaiveBayesClassifier on subject data and print diagnostics.

    Prints the data set and split sizes, the Counter of (actual, predicted)
    pairs using a 0.8 threshold, the five highest-scoring hams, the five
    lowest-scoring spams, and the index labels of the last five rows of
    ``classifier.word_probs`` when sorted by ``prob_is_spam`` and by
    ``prob_not_spam``.
    """
    random.seed(0)      # fixed seed so the split is reproducible
    data = get_subject_data(path)
    print(f'data: {len(data)}')

    train_data, test_data = split_data(data, 0.75)

    print(f'Train data size: {len(train_data)}')
    print(f'Test data size: {len(test_data)}')

    model = NaiveBayesClassifier()
    model.train(train_data)

    # triplets of (subject, actual label, value returned by classify)
    scored = [(text, label, model.classify(text))
              for text, label in test_data]

    # tally (actual, predicted) pairs
    counts = Counter((label, score > .8) for _, label, score in scored)
    print(counts)

    # ascending by score: low scores first, high scores last
    ranked = sorted(scored, key=lambda triple: triple[2])
    spammiest_hams = [triple for triple in ranked if not triple[1]][-5:]
    hammiest_spams = [triple for triple in ranked if triple[1]][:5]

    print("\nspammiest_hams", spammiest_hams)
    print("\nhammiest_spams", hammiest_spams)

    # tail() of an ascending sort holds the largest values of that column
    by_spam = model.word_probs.sort_values(by='prob_is_spam')
    spammiest_words = by_spam.tail().index.to_list()
    by_ham = model.word_probs.sort_values(by='prob_not_spam')
    hammiest_words = by_ham.tail().index.to_list()

    print("\nspammiest_words", spammiest_words)
    print("\nhammiest_words", hammiest_words)
Esempio n. 3
0
def train_and_test_model(path):
    """Train a NaiveBayesClassifier on subject data and print diagnostics.

    Written for Python 3: ``print`` is called as a function and the
    filtered rows are built as lists, because ``filter`` returns an
    iterator that cannot be sliced.

    Args:
        path: passed straight to ``get_subject_data`` to load
            ``(subject, is_spam)`` pairs.
    """
    data = get_subject_data(path)
    random.seed(0)  # fixed seed so the split is reproducible
    train_data, test_data = split_data(data, .75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # triplets of (subject, actual is_spam, value returned by classify)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    # tally (actual, predicted) pairs; a score above 0.5 counts as spam
    counts = Counter((is_spam, spam_probability > .5)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    # ascending by score, so the tail holds the highest scores
    classified.sort(key=lambda row: row[2])
    spammiest_hams = [row for row in classified if not row[1]][-5:]
    hammiest_spams = [row for row in classified if row[1]][:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)
def train_and_test_model(path):
    """Train a NaiveBayesClassifier on subject data and print diagnostics.

    Written for Python 3: ``print`` is called as a function and the
    filtered rows are built as lists, because ``filter`` returns an
    iterator that cannot be sliced.

    Args:
        path: passed straight to ``get_subject_data`` to load
            ``(subject, is_spam)`` pairs.
    """
    data = get_subject_data(path)
    random.seed(0)      # fixed seed so the split is reproducible
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # triplets of (subject, actual is_spam, value returned by classify)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    # ascending by score, so the tail holds the highest scores
    classified.sort(key=lambda row: row[2])
    spammiest_hams = [row for row in classified if not row[1]][-5:]
    hammiest_spams = [row for row in classified if row[1]][:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)
Esempio n. 5
0
def main():
    """Build a spam data set from e-mail files, train a model and report.

    Reads every file matching ``resources/spam_data/*/*``, takes the first
    line starting with ``"Subject: "`` as the message text, labels the
    message as spam when the file path does not contain ``"ham"``, trains
    a ``NaiveBayesClassifier`` on part of the data and prints the
    (actual, predicted) Counter plus the ten highest / lowest tokens
    ordered by ``p_spam_given_token``.
    """
    path = "resources/spam_data/*/*"
    subject_prefix = "Subject: "

    data: List[Message] = []

    for filename in glob.glob(path):
        is_spam = "ham" not in filename
        # errors='ignore' drops undecodable bytes instead of raising
        with open(filename, errors='ignore') as email_file:
            for line in email_file:
                if line.startswith(subject_prefix):
                    # Slice the prefix off. str.lstrip(chars) treats its
                    # argument as a set of characters and would also eat
                    # leading letters of the subject (e.g. "test", "bet").
                    subject = line[len(subject_prefix):]
                    data.append(Message(subject, is_spam))
                    break  # only the first subject line of each file
    random.seed(0)  # fixed seed so the split is reproducible
    train_messages, test_messages = split_data(data, 0.75)

    model = NaiveBayesClassifier()
    model.train(train_messages)

    # pairs of (message, value returned by model.predict for its text)
    predictions = [(message, model.predict(message.text))
                   for message in test_messages]
    # tally (actual, predicted) pairs; a score above 0.5 counts as spam
    confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                               for message, spam_probability in predictions)
    print(confusion_matrix)

    def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
        """Return prob_if_spam / (prob_if_spam + prob_if_ham) for *token*."""
        prob_if_spam, prob_if_ham = model._probabilities(token)
        return prob_if_spam / (prob_if_spam + prob_if_ham)

    words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model))

    print("spamiest_words", words[-10:])
    print("hamiest_words", words[:10])
def train_and_test_model(path):
    """Train a NaiveBayesClassifier on tweet data and print diagnostics.

    Loads ``(tweets, is_solo_artist)`` pairs with ``get_subject_data(path)``,
    trains on one part of the split and scores the other. Prints the Counter
    of (actual, predicted) pairs using a 0.5 threshold and the five highest /
    lowest entries of ``classifier.word_probs`` when ordered by
    ``p_solo_artist_given_word``.

    Written for Python 3: ``print`` is called as a function.
    """
    data = get_subject_data(path)
    random.seed(0)      # fixed seed so the split is reproducible
    # roughly 75% train / 25% test: split_data's second argument is the
    # share sent to the first returned list
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # triplets of (tweets, actual label, value returned by classify)
    classified = [(tweets, is_solo_artist, classifier.classify(tweets))
                  for tweets, is_solo_artist in test_data]

    counts = Counter((is_solo_artist, solo_artist_probability > 0.5)  # (actual, predicted)
                     for _, is_solo_artist, solo_artist_probability in classified)
    print(counts)

    words = sorted(classifier.word_probs, key=p_solo_artist_given_word)

    solo_artist_words = words[-5:]
    band_words = words[:5]

    print("most_probable_solo_artist_words", solo_artist_words)
    print("most_probable_band_words", band_words)
Esempio n. 7
0
def train_and_test_model(path):
    """Train a NaiveBayesClassifier on subject data and print diagnostics.

    Besides the usual report (Counter of (actual, predicted) pairs, extreme
    hams/spams and extreme words), it classifies two hard-coded sample
    messages and prints their tokens and scores.

    Written for Python 3: ``print`` is called as a function and the
    filtered rows are built as lists, because ``filter`` returns an
    iterator that cannot be sliced.
    """
    data = get_subject_data(path)
    random.seed(0)      # fixed seed so the split is reproducible
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # triplets of (subject, actual is_spam, value returned by classify)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    # ad-hoc check on two sample messages
    print("----------test begin part 1 is spam-------------")
    msg = "i like to play basketball"
    print(msg)
    print(tokenize(msg))
    a = classifier.classify(msg)
    print(a)
    print("-----------test end----------")

    print("----------test begin part 2 is091609151152s1rssr spam-------------")
    msg_two = "software"
    print(msg_two)
    print(tokenize(msg_two))
    b = classifier.classify(msg_two)
    print(b)
    print("-----------test end----------")

    print(counts)

    # ascending by score, so the tail holds the highest scores
    classified.sort(key=lambda row: row[2])
    spammiest_hams = [row for row in classified if not row[1]][-5:]
    hammiest_spams = [row for row in classified if row[1]][:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)
Esempio n. 8
0
def train_and_test_model(path):
    """Train a NaiveBayesClassifier on subject data and print diagnostics.

    Prints the Counter of (actual, predicted) pairs, then the tokens and
    scores of two hard-coded sample messages, then the extreme hams/spams
    and extreme words.

    Written for Python 3: every ``print`` is a function call and the
    filtered rows are built as lists, because ``filter`` returns an
    iterator that cannot be sliced.
    """
    data = get_subject_data(path)
    random.seed(0)  # fixed seed so the split is reproducible
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # triplets of (subject, actual is_spam, value returned by classify)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    # ad-hoc check on two sample messages
    print("---20150518 ex---")
    msg = "I like to play computer."
    msg2 = "I don't know how to do it."

    print("---test begin---")
    print(msg)
    print(tokenize(msg))
    a = classifier.classify(msg)
    print(a)

    print(msg2)
    print(tokenize(msg2))
    b = classifier.classify(msg2)
    print(b)
    print("---test end----")
    print("---20150518 ex---\n")

    # ascending by score, so the tail holds the highest scores
    classified.sort(key=lambda row: row[2])
    spammiest_hams = [row for row in classified if not row[1]][-5:]
    hammiest_spams = [row for row in classified if row[1]][:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)
def train_and_test_model2(path):
    """Train a NaiveBayesClassifier and return its test-set predictions.

    Prints the Counter of (actual, predicted) pairs using a 0.5 threshold.

    Returns:
        A pair of NumPy arrays built from the test rows, in order: the
        ``score > 0.5`` decisions, and the raw values returned by
        ``classify``.
    """
    data = get_subject_data(path)
    random.seed(0)  # fixed seed so the split is reproducible
    train_data, test_data = split_data(data, 0.75)

    model = NaiveBayesClassifier()
    model.train(train_data)

    # triplets of (subject, actual label, value returned by classify)
    scored = [(text, label, model.classify(text))
              for text, label in test_data]

    # tally (actual, predicted) pairs
    counts = Counter((label, score > 0.5) for _, label, score in scored)
    print(counts)

    decisions = np.array([score > 0.5 for _, _, score in scored])
    scores = np.array([score for _, _, score in scored])
    return decisions, scores
Esempio n. 10
0
def train_and_test_model(path):
    """Train a NaiveBayesClassifier on subject data and print diagnostics.

    Prints the tokens and scores of two hard-coded sample messages, then
    the Counter of (actual, predicted) pairs, the extreme hams/spams and
    the extreme words.

    Written for Python 3: every ``print`` is a function call and the
    filtered rows are built as lists, because ``filter`` returns an
    iterator that cannot be sliced.
    """
    data = get_subject_data(path)
    random.seed(0)  # fixed seed so the split is reproducible
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # triplets of (subject, actual is_spam, value returned by classify)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    # ad-hoc check on two sample messages
    print("-------test1 start------")
    msg1 = "What the f**k"
    print(tokenize(msg1))
    print(classifier.classify(msg1))
    print("-------test1 end------\n")

    print("-------test2 start------")
    msg2 = "Hello World"
    print(tokenize(msg2))
    print(classifier.classify(msg2))
    print("-------test2 end------\n")

    print(counts)

    # ascending by score, so the tail holds the highest scores
    classified.sort(key=lambda row: row[2])
    spammiest_hams = [row for row in classified if not row[1]][-5:]
    hammiest_spams = [row for row in classified if row[1]][:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)
Esempio n. 11
0
def train_and_test_model(path):
    """Train a NaiveBayesClassifier on subject data and print diagnostics.

    Prints the tokens and scores of two hard-coded sample messages, then
    the Counter of (actual, predicted) pairs, the extreme hams/spams and
    the extreme words.

    Written for Python 3: every ``print`` is a function call and the
    filtered rows are built as lists, because ``filter`` returns an
    iterator that cannot be sliced.
    """
    data = get_subject_data(path)
    random.seed(0)  # fixed seed so the split is reproducible
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # triplets of (subject, actual is_spam, value returned by classify)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    # ad-hoc check on two sample messages
    msg = "I Love you so much ha ha."
    msg2 = "What the F**k"
    print("----test bigin----")
    print(msg)
    print(tokenize(msg))
    a = classifier.classify(msg)
    print(a)
    print("----2----")
    print(msg2)
    print(tokenize(msg2))
    b = classifier.classify(msg2)
    print(b)
    print("----test END----")
    print(counts)

    # ascending by score, so the tail holds the highest scores
    classified.sort(key=lambda row: row[2])
    spammiest_hams = [row for row in classified if not row[1]][-5:]
    hammiest_spams = [row for row in classified if row[1]][:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)
Esempio n. 12
0
def train_and_test_model(path, settings_options: dict = None):
    """Train a NaiveBayesClassifier and return its accuracy on the test split.

    Args:
        path: passed straight to ``get_subject_data`` to load
            ``(subject, is_spam)`` pairs.
        settings_options: optional mapping; each key/value is written onto
            the ``settings`` object with ``setattr`` before the data is
            loaded.

    Returns:
        The share of test rows whose thresholded prediction (score > 0.5)
        equals the actual label, or ``0.0`` when the test split is empty.
    """
    if settings_options:
        for key, value in settings_options.items():
            setattr(settings, key, value)
    data = get_subject_data(path)
    random.seed(0)  # fixed seed so the split is reproducible
    # 0.83 is the share handed to split_data for the train/test split
    train_data, test_data = split_data(data, 0.83)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # triplets of (subject, actual is_spam, value returned by classify)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_prob > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_prob in classified)

    # accuracy = matching (actual, predicted) pairs / all test rows
    hits = counts[(True, True)] + counts[(False, False)]
    # an empty test split would otherwise raise ZeroDivisionError
    accuracy = hits / len(classified) if classified else 0.0
    print("Accuracy:", accuracy, "Counts:", counts, "Settings:",
          settings_options)

    return accuracy
Esempio n. 13
0
# glob.glob returns every file name that matches the wildcard path
for fn in glob.glob(path):
    # a path without "ham" in it is labelled as spam
    is_spam = "ham" not in fn

    with open(fn, 'r', encoding='ISO-8859-1') as email_file:
        for line in email_file:
            if not line.startswith("Subject:"):
                continue
            # remove the leading "Subject: " and keep what's left
            subject = subject_regex.sub("", line).strip()
            data.append((subject, is_spam))

random.seed(0)  # fixed seed so the split is reproducible

train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifer()
classifier.train(train_data)

# triplets (subject, actual label, value returned by classify)
classified = [(text, actual, classifier.classify(text))
              for text, actual in test_data]

# a score above 0.5 is taken as a spam prediction;
# count the combinations of (actual label, predicted label)
counts = Counter((actual, spam_probability > 0.5)
                 for _, actual, spam_probability in classified)

print(counts)
 def test_split_data(self):
     # random.random is patched to yield 0.0, 0.1, ..., 0.6 in turn
     draws = [r/10 for r in range(7)]
     expected = ([0, 1, 2, 3], [4, 5, 6])
     with mock.patch.object(random, 'random', side_effect=draws):
         actual = machine_learning.split_data(range(7), 0.4)
     self.assertEqual(expected, actual)
 def test_split_data(self):
     # random.random is patched to yield 0.0, 0.1, ..., 0.6 in turn
     fake_draws = [r / 10 for r in range(7)]
     patcher = mock.patch.object(random, 'random', side_effect=fake_draws)
     with patcher:
         result = machine_learning.split_data(range(7), 0.4)
     self.assertEqual(([0, 1, 2, 3], [4, 5, 6]), result)
Esempio n. 16
0
					# Subject 부분을 제거하고 나머지 부분을 유지
					subject = re.sub(r"^Subject: ", "", line).strip()
					data.append((subject, is_spam))
	return data


def p_spam_given_word(word_prob):
	"""Use Bayes' theorem to compute p(spam | message contains the word).

	``word_prob`` is a triplet ``(word, prob_if_spam, prob_if_not_spam)``;
	the result is ``prob_if_spam / (prob_if_spam + prob_if_not_spam)``.
	"""
	_, spam_likelihood, ham_likelihood = word_prob
	total = spam_likelihood + ham_likelihood
	return spam_likelihood / total


if __name__ == "__main__":
	# Script entry point: load the subject data, train a classifier on part
	# of it and tally (actual, predicted) pairs for the rest.
	data = get_subject_data()
	random.seed(0)  # set so the results match the example
	train_data, test_data = split_data(data, 0.75)

	# NOTE(review): spelled "Classfier" here — confirm it matches the class definition
	classifier = NaiveBayesClassfier()
	classifier.train(train_data)

	# triplets of (subject, actual is_spam, value returned by classify)
	classified = [(subject, is_spam, classifier.classify(subject))
				  for subject, is_spam in test_data]

	counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
					 for _, is_spam, spam_probability in classified)
	print(counts)

	# sort by spam probability in ascending order
	classified.sort(key=lambda row: row[2])

	# among the non-spam messages, the ones with the highest spam probability
Esempio n. 17
0
    is_spam = "ham" not in filename

    # There are some garbage characters in the emails; the errors = 'ignore'
    # skips them instead of raising an exception
    with open(filename, errors='ignore') as email_file:
        for line in email_file:
            if line.startswith("Subject:"):
                subject = line.lstrip("Subject: ")
                data.append(Message(subject, is_spam))
                break  # done with this file

import random
from machine_learning import split_data

random.seed(0)  # fixed seed so the split is reproducible
train_messages, test_messages = split_data(data, 0.75)

model = NaiveBayesClassifier()
model.train(train_messages)

from collections import Counter

# pairs of (message, value returned by model.predict for its text)
predictions = [(msg, model.predict(msg.text)) for msg in test_messages]

# tally (actual label, score above 0.5) pairs
confusion_matrix = Counter(
    (msg.is_spam, score > 0.5) for msg, score in predictions
)
print(confusion_matrix)


def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
def get_train_test_data(path):
    """Load the subject data at *path* and return it as ``(train, test)``.

    The random module is seeded with 0 before ``split_data(data, 0.75)``
    is called, so repeated calls see the same random sequence.
    """
    subjects = get_subject_data(path)
    random.seed(0)  # fixed seed so the split is reproducible
    train_part, test_part = split_data(subjects, 0.75)
    return train_part, test_part
Esempio n. 19
0
        ax[row][col].set_xticks([])
        ax[row][col].set_xticks([])

        for mark, (species, points) in zip(marks, points_by_species.items()):
            xs = [point[i] for point in points]
            ys = [point[j] for point in points]
            ax[row][col].scatter(xs, ys, marker=mark, label=species)

# add a legend to the last subplot in the grid and display the figure
ax[-1][-1].legend(loc='lower right', prop={'size': 6})
plt.show()

import random
from machine_learning import split_data
import math
random.seed(12)  # fixed seed so the split is repeatable
iris_train, iris_test = split_data(iris_data, 0.70)

# sanity-check the sizes of the two parts returned by split_data
assert len(iris_train) == math.floor(0.7 * len(iris_data))
assert len(iris_test) == math.ceil(0.3 * len(iris_data))

from typing import Tuple

# track how many times we see (predicted, actual)

confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)
num_correct = 0
k = 5  # first argument passed to knn_classify below
for iris in iris_test:
    # classify each test point against the training set
    predicted = knn_classify(k, iris_train, iris.point)
    actual = iris.label