Beispiel #1
0
def test1():
    """Train on ``res/test1.txt`` and print accuracy on ``res/test1_check.txt``.

    Every training message is tokenized with a word-character regexp
    tokenizer; the pooled ham tokens and pooled spam tokens are passed to
    ``NaiveBayes.load``. Each message of the check file is then classified
    with ``NaiveBayes.is_positive``: a truthy result counts as a "ham"
    prediction, a falsy one as "spam".

    For each class two lines are printed: the correct/incorrect counts and
    the percentage of correct predictions. If a class has no check messages,
    a short notice is printed instead of the percentage (the percentage
    would otherwise divide by zero).
    """
    tokenizer = RegexpTokenizer(r'\w+')

    # load_data is handed two empty lists (hams first, then spams) and is
    # presumably expected to fill them in place -- confirm against util.
    spams = []
    hams = []
    load_data(hams, spams, 'res/test1.txt')

    spam_words = []
    for spam in spams:
        spam_words.extend(tokenizer.tokenize(spam))

    ham_words = []
    for ham in hams:
        ham_words.extend(tokenizer.tokenize(ham))

    naive_bayes = NaiveBayes()
    naive_bayes.load(ham_words, spam_words)

    test_spams = []
    test_hams = []
    load_data(test_hams, test_spams, 'res/test1_check.txt')

    # (label, messages, whether is_positive should be truthy for this class).
    # Spam is evaluated and reported before ham, as in the printed output.
    evaluation_plan = (
        ('spam:', test_spams, False),
        ('ham:', test_hams, True),
    )
    for label, messages, expect_positive in evaluation_plan:
        correct = 0
        incorrect = 0
        for message in messages:
            predicted_positive = bool(
                naive_bayes.is_positive(tokenizer.tokenize(message)))
            if predicted_positive == expect_positive:
                correct += 1
            else:
                incorrect += 1

        print(label, 'correct', correct, 'incorrect', incorrect)

        total = correct + incorrect
        if total:
            print(label, (correct / total) * 100, '%')
        else:
            # Nothing to evaluate for this class: avoid ZeroDivisionError.
            print(label, 'no test messages to evaluate')
from classifier import NaiveBayes
from util import load_data

# Train the classifier once, at start-up, on the SMS corpus file.
tokenizer = RegexpTokenizer(r'\w+')

# load_data is handed two empty lists (hams first, then spams) and is
# presumably expected to fill them in place -- confirm against util.
spams = []
hams = []
load_data(hams, spams, 'res/SMSSpamCollection.txt')
spam_words = []
ham_words = []

for spam in spams:
    spam_words += tokenizer.tokenize(spam)

for ham in hams:
    ham_words += tokenizer.tokenize(ham)

naive_bayes = NaiveBayes()

naive_bayes.load(ham_words, spam_words)

# Interactive loop: classify each entered message until the user types
# "stop". The sentinel is checked right after reading so that "stop" itself
# is not run through the classifier and reported as ham/spam.
while True:
    message = input("Enter your SMS:")
    if message == "stop":
        break
    if naive_bayes.is_positive(tokenizer.tokenize(message)):
        print("ham")
    else:
        print("spam")
Beispiel #3
0
def test2(is_from_begginning=True, training_percent=70):
    """Hold-out evaluation on ``res/SMSSpamCollection.txt``.

    The records returned by ``get_data`` are split into a training part and
    a disjoint test part. The training messages are tokenized and passed to
    ``NaiveBayes.load``; every test message is then classified with
    ``NaiveBayes.is_positive`` (truthy counts as "ham", falsy as "spam") and
    per-class correct/incorrect counts and accuracy percentages are printed.

    Args:
        is_from_begginning: If true, the training part is taken from the
            start of the data and the test part is the remainder. Otherwise
            the training part is taken from the end of the data and the test
            part is the leading remainder.
        training_percent: Percentage (0-100) of the records used for
            training; the record count is truncated to an integer.

    Raises:
        ValueError: If ``training_percent`` is outside the range 0-100.
    """
    if not 0 <= training_percent <= 100:
        raise ValueError(
            'training_percent must be between 0 and 100, got %r'
            % (training_percent,))

    tokenizer = RegexpTokenizer(r'\w+')

    data = get_data('res/SMSSpamCollection.txt')

    training_data_length = int((len(data) * training_percent) / 100)

    # Split with non-negative indices only: a negative-index slice such as
    # data[-0:] yields the whole sequence rather than an empty one.
    if is_from_begginning:
        training_data = data[:training_data_length]
        test_data = data[training_data_length:]
    else:
        test_data_length = len(data) - training_data_length
        # The test part is the leading remainder, so it never overlaps the
        # trailing training part.
        test_data = data[:test_data_length]
        training_data = data[test_data_length:]

    # divide_data is handed two empty lists (hams first, then spams) and is
    # presumably expected to fill them in place -- confirm against its
    # definition.
    training_hams = []
    training_spams = []
    divide_data(training_data, training_hams, training_spams)

    training_ham_words = []
    for ham in training_hams:
        training_ham_words.extend(tokenizer.tokenize(ham))

    training_spam_words = []
    for spam in training_spams:
        training_spam_words.extend(tokenizer.tokenize(spam))

    naive_bayes = NaiveBayes()
    naive_bayes.load(training_ham_words, training_spam_words)

    test_hams = []
    test_spams = []
    divide_data(test_data, test_hams, test_spams)

    # (label, messages, whether is_positive should be truthy for this class).
    # Spam is evaluated and reported before ham, as in the printed output.
    evaluation_plan = (
        ('spam:', test_spams, False),
        ('ham:', test_hams, True),
    )
    for label, messages, expect_positive in evaluation_plan:
        correct = 0
        incorrect = 0
        for message in messages:
            predicted_positive = bool(
                naive_bayes.is_positive(tokenizer.tokenize(message)))
            if predicted_positive == expect_positive:
                correct += 1
            else:
                incorrect += 1

        print(label, 'correct', correct, 'incorrect', incorrect)

        total = correct + incorrect
        if total:
            print(label, (correct / total) * 100, '%')
        else:
            # Nothing to evaluate for this class (e.g. training_percent=100):
            # avoid ZeroDivisionError.
            print(label, 'no test messages to evaluate')
    # NOTE(review): from here on the code uses np/pd and a fit/predict style
    # classifier, unlike the code above it -- it looks like a separate
    # snippet whose enclosing header is not visible here; confirm.
    folds = np.array_split(data, 10)  # split the dataset into 10 folds
    # Index of the fold held out as the test set.
    # NOTE(review): test_set is not advanced anywhere in the lines visible
    # here, so fold 0 would be held out on every iteration unless it is
    # updated further down -- confirm.
    test_set = 0
    # For each fold index: use one fold as the test set and the other folds
    # as the training set.
    for y in range(len(folds)):
        X_train = pd.DataFrame()
        # Folds other than the test fold are appended to the training frame.
        for x in range(len(folds)):
            if x == test_set:
                y_test = folds[x]['class'].values
                X_test = folds[x].drop(['class'], axis=1)
            else:
                # NOTE(review): DataFrame.append was removed in pandas 2.0
                # (pd.concat is the replacement) -- confirm which pandas
                # version this code targets.
                X_train = X_train.append(folds[x])

        # Separate the 'class' column (labels) from the remaining columns.
        y_train = X_train['class'].values
        X_train = X_train.drop(['class'], axis=1)
        nb = NaiveBayes()  # initialize the Naive Bayes classifier
        nb.fit(X_train, y_train)  # train the model on the training data
        y_pred = nb.predict(X_test)  # predict labels for the test fold
        # Error with respect to the zero-one loss function; 1 - error is
        # reported and stored as this fold's accuracy.
        error = nb.zero_one_loss_function(y_test, y_pred)
        printstr = "\nAccuracy of 0-1 loss for fold {0} ::: {1}".format(
            y, (1 - error))
        print_both(file, printstr)
        accuracy_list.append((1 - error))
        # Accuracy, precision and recall as returned by nb.confusion_matrix.
        acc, precision, recall = nb.confusion_matrix(y_test, y_pred)
        printstr = "\nCF for fold {0} ::: acc:: {1} :: precision:: {2} :: recall :: {3}".format(
            y, acc, precision, recall)
        print_both(file, printstr)
        CF_accuracy_list.append(acc)
        CF_precision_list.append(precision)