Beispiel #1
0
def test1():
    """Train on 'res/test1.txt' and print accuracy on 'res/test1_check.txt'.

    Messages are tokenized with the ``\\w+`` regexp tokenizer; all ham tokens
    and all spam tokens are handed to ``NaiveBayes.load``. Each check message
    is then classified with ``is_positive``: a truthy result counts as a
    correct ham / incorrect spam, a falsy result as the opposite.

    Prints the raw counts and the percentage of correct answers for each
    class. When a class has no check messages the percentage line reports
    that instead of dividing by zero.
    """
    tokenizer = RegexpTokenizer(r'\w+')

    # load_data is expected to fill both lists in place (they are passed in
    # empty and iterated right after) -- TODO confirm against util.load_data.
    spams = []
    hams = []
    load_data(hams, spams, 'res/test1.txt')

    spam_words = []
    for spam in spams:
        spam_words.extend(tokenizer.tokenize(spam))

    ham_words = []
    for ham in hams:
        ham_words.extend(tokenizer.tokenize(ham))

    naive_bayes = NaiveBayes()
    naive_bayes.load(ham_words, spam_words)

    test_spams = []
    test_hams = []
    load_data(test_hams, test_spams, 'res/test1_check.txt')

    # Spam messages: a positive answer is a misclassification.
    spam_correct = 0
    spam_incorrect = 0
    for message in test_spams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            spam_incorrect += 1
        else:
            spam_correct += 1

    print('spam:', 'correct', spam_correct, 'incorrect', spam_incorrect)
    spam_total = spam_correct + spam_incorrect
    if spam_total:
        print('spam:', (spam_correct / spam_total) * 100, '%')
    else:
        # No spam in the check set: there is no ratio to report.
        print('spam:', 'no test messages')

    # Ham messages: a positive answer is the expected one.
    ham_correct = 0
    ham_incorrect = 0
    for message in test_hams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            ham_correct += 1
        else:
            ham_incorrect += 1

    print('ham:', 'correct', ham_correct, 'incorrect', ham_incorrect)
    ham_total = ham_correct + ham_incorrect
    if ham_total:
        print('ham:', (ham_correct / ham_total) * 100, '%')
    else:
        # No ham in the check set: there is no ratio to report.
        print('ham:', 'no test messages')
Beispiel #2
0
def test2(is_from_begginning=True, training_percent=70):
    """Evaluate the classifier on a train/test split of the SMS collection.

    The records returned by ``get_data('res/SMSSpamCollection.txt')`` are cut
    into two disjoint, contiguous slices; the model is trained on one and
    scored on the other. Counts and the percentage of correct answers are
    printed separately for spam and ham.

    Args:
        is_from_begginning: When true, the training slice is the head of the
            data and the test slice is the remainder. When false, the
            training slice is the tail and the test slice is the head.
        training_percent: Share of the records (0-100) used for training.
            Values outside that range are clamped to the size of the data.
    """
    tokenizer = RegexpTokenizer(r'\w+')

    data = get_data('res/SMSSpamCollection.txt')

    training_data_length = int((len(data) * training_percent) / 100)
    # Clamp so that out-of-range percentages cannot produce wrapped slices.
    training_data_length = max(0, min(len(data), training_data_length))

    # A single split index keeps the two slices disjoint. Negative-length
    # slicing such as data[-n:] is avoided on purpose: for n == 0 it returns
    # the whole sequence rather than an empty one.
    if is_from_begginning:
        split_index = training_data_length
        training_data = data[:split_index]
        test_data = data[split_index:]
    else:
        split_index = len(data) - training_data_length
        training_data = data[split_index:]
        test_data = data[:split_index]

    # divide_data is expected to fill the two lists in place (they are passed
    # in empty and iterated right after) -- TODO confirm against its source.
    training_hams = []
    training_spams = []
    divide_data(training_data, training_hams, training_spams)

    training_ham_words = []
    for ham in training_hams:
        training_ham_words.extend(tokenizer.tokenize(ham))

    training_spam_words = []
    for spam in training_spams:
        training_spam_words.extend(tokenizer.tokenize(spam))

    naive_bayes = NaiveBayes()
    naive_bayes.load(training_ham_words, training_spam_words)

    test_hams = []
    test_spams = []
    divide_data(test_data, test_hams, test_spams)

    # Spam messages: a positive answer is a misclassification.
    spam_correct = 0
    spam_incorrect = 0
    for message in test_spams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            spam_incorrect += 1
        else:
            spam_correct += 1

    print('spam:', 'correct', spam_correct, 'incorrect', spam_incorrect)
    spam_total = spam_correct + spam_incorrect
    if spam_total:
        print('spam:', (spam_correct / spam_total) * 100, '%')
    else:
        # The test slice can be empty (e.g. training_percent=100).
        print('spam:', 'no test messages')

    # Ham messages: a positive answer is the expected one.
    ham_correct = 0
    ham_incorrect = 0
    for message in test_hams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            ham_correct += 1
        else:
            ham_incorrect += 1

    print('ham:', 'correct', ham_correct, 'incorrect', ham_incorrect)
    ham_total = ham_correct + ham_incorrect
    if ham_total:
        print('ham:', (ham_correct / ham_total) * 100, '%')
    else:
        # The test slice can be empty (e.g. training_percent=100).
        print('ham:', 'no test messages')
Beispiel #3
0
	def __init__(self, directory = 'data/corpus2', spam = 'spam', ham = 'ham', limit = 1000):
		# Both class folders are resolved against the working directory that
		# is current at construction time.
		cwd = os.getcwd()
		self.spam_path = os.path.join(cwd, directory, spam)
		self.ham_path = os.path.join(cwd, directory, ham)
		# Stored as given; not validated here.
		self.limit = limit
		# Fresh classifier instance owned by this object.
		self.classifier = NaiveBayes()
Beispiel #4
0
class Trainer(object):
	# Feeds the emails found in a spam folder and a ham folder to a
	# NaiveBayes classifier, one feature list per email.
	#
	# NOTE: this class targets Python 2 byte strings (two-argument
	# str.translate, decode/encode round trip on the file contents).

	# Initializes the object
	# @param self Trainer object
	# @param directory The directory that contains the training folders
	# @param spam The sub directory for the spam class
	# @param ham The sub directory for the ham class.
	# @param limit The maximum number of emails to be scanned per class
	def __init__(self, directory = 'data/corpus2', spam = 'spam', ham = 'ham', limit = 1000):
		self.spam_path = os.path.join(os.getcwd(), directory, spam)
		self.ham_path = os.path.join(os.getcwd(), directory, ham)
		self.limit = limit
		self.classifier = NaiveBayes()

	# A wrapper for the train_classifier function: trains the spam folder
	# first, then the ham folder.
	# @param self The trainer object
	# @param verbose Depending on verbosity information will be printed
	# @return The classifier object
	def train(self, verbose = False):
		self.train_classifier(self.spam_path,'spam', verbose)
		self.train_classifier(self.ham_path,'ham', verbose)

		return self.classifier

	# Converts a document into a list of features.
	# Whitespace-separated tokens that are English stop words, or that have
	# fewer than 3 characters once punctuation is removed, are dropped.
	# Each remaining token yields, in this order of precedence:
	#   'ATTACHMENT'          if "." + token is a known mimetypes extension
	#   'NUMBER'              if the token consists of digits only
	#   'ALL_CAPS' + the stem if the token equals its upper-cased form
	#   'LINK'                if the token starts with an http(s) URL
	#   the stem              otherwise
	# where "the stem" is the lower-cased Porter stem of the token with
	# punctuation removed.
	# @param self The Trainer object
	# @param text The text to be scanned
	# @return The list of extracted features
	def extract_features(self, text):
		features = []
		porter = stem.porter.PorterStemmer()
		# Build the stop-word lookup once instead of once per token.
		stop_words = set(stopwords.words('english'))
		link = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
		for token in text.split():
			if token in stop_words:
				continue
			# Python 2 str.translate(None, chars) deletes the given chars.
			stripped = token.translate(None, string.punctuation)
			if len(stripped) < 3:
				continue
			if "." + token in mimetypes.types_map:
				features.append('ATTACHMENT')
			elif token.isdigit():
				features.append('NUMBER')
			elif token.upper() == token:
				features.append('ALL_CAPS')
				features.append(porter.stem(stripped).lower())
			elif link.match(token):
				features.append('LINK')
			else:
				features.append(porter.stem(stripped).lower())

		return features


	# Trains the classifier with the emails of one folder.
	# Files are opened through their joined path, so the process working
	# directory is left untouched and relative paths keep working.
	# Files whose content cannot be converted to UTF-8 are reported and
	# skipped.
	# @param self The Trainer object
	# @param path The path of the folder holding the emails to train on
	# @param label The label under which the data is classified
	# @param verbose The verbosity of the statistics printed; a value
	#        greater than 1 additionally prints every file name
	def train_classifier(self, path, label, verbose):
		# List the folder once; at most self.limit entries are processed.
		emails = os.listdir(path)[:self.limit]
		if verbose:
			print(colored("Training %d emails in %s class" %(len(emails), label),'green'))

		for email in emails:
			if verbose and verbose > 1:
				print(colored("Working on file %s" %(email),'green'))
			# The with-block closes the file even when the email is skipped.
			with open(os.path.join(path, email), 'r') as email_file:
				email_text = email_file.read()
			try:
				email_text = bs4.UnicodeDammit.detwingle(email_text).decode('utf-8')
			except Exception:
				# Best effort: one undecodable file must not abort training.
				print(colored("Skipping file %s because of bad coding"%(email),'red'))
				continue
			# Drop every non-ASCII character before feature extraction.
			email_text = email_text.encode('ascii', 'ignore')
			features = self.extract_features(email_text)
			self.classifier.train(features, label)
		print(colored(self.classifier,'green'))
from classifier import NaiveBayes
from util import load_data

tokenizer = RegexpTokenizer(r'\w+')

# Training corpus. load_data is expected to fill both lists in place (they
# are passed in empty and iterated right after) -- TODO confirm in util.
spams = []
hams = []
load_data(hams, spams, 'res/SMSSpamCollection.txt')
spam_words = []
ham_words = []

# The classifier is loaded with flat token lists, one per class.
for spam in spams:
    spam_words += tokenizer.tokenize(spam)

for ham in hams:
    ham_words += tokenizer.tokenize(ham)

naive_bayes = NaiveBayes()

naive_bayes.load(ham_words, spam_words)

message = ""

# Interactive loop: classify every entered SMS until the user types "stop".
while True:
    try:
        message = input("Enter your SMS:")
    except EOFError:
        # Input stream exhausted or closed: leave the loop quietly.
        break
    if message == "stop":
        # The sentinel ends the session; it is not an SMS to classify.
        break
    # A positive answer from the classifier is reported as ham.
    if naive_bayes.is_positive(tokenizer.tokenize(message)):
        print("ham")
    else:
        print("spam")
    folds = np.array_split(data, 10)  #make 10 folds in the dataset
    test_set = 0  #define test_set
    #for each folds treat one fold as test set and 9 fols at train set
    for y in range(len(folds)):
        X_train = pd.DataFrame()
        #if not test-set append fold in the train set
        for x in range(len(folds)):
            if x == test_set:
                y_test = folds[x]['class'].values
                X_test = folds[x].drop(['class'], axis=1)
            else:
                X_train = X_train.append(folds[x])

        y_train = X_train['class'].values
        X_train = X_train.drop(['class'], axis=1)
        nb = NaiveBayes()  #initialize Naive Bayes Classifier
        nb.fit(X_train, y_train)  #train model with train data
        y_pred = nb.predict(X_test)  #test model with test set
        #find error with respect to zero-one loss function
        error = nb.zero_one_loss_function(y_test, y_pred)
        printstr = "\nAccuracy of 0-1 loss for fold {0} ::: {1}".format(
            y, (1 - error))
        print_both(file, printstr)
        accuracy_list.append((1 - error))
        #get mean square error
        acc, precision, recall = nb.confusion_matrix(y_test, y_pred)
        printstr = "\nCF for fold {0} ::: acc:: {1} :: precision:: {2} :: recall :: {3}".format(
            y, acc, precision, recall)
        print_both(file, printstr)
        CF_accuracy_list.append(acc)
        CF_precision_list.append(precision)