from nltk.tokenize import RegexpTokenizer

from classifier import NaiveBayes
# load_data is imported from util elsewhere in this repo; get_data and
# divide_data (used by test2 below) are assumed to live there as well.
from util import load_data, get_data, divide_data


def test1():
    tokenizer = RegexpTokenizer(r'\w+')

    # Load the labelled training messages.
    spams = []
    hams = []
    load_data(hams, spams, 'res/test1.txt')

    # Flatten each message list into one list of training tokens per class.
    spam_words = []
    ham_words = []
    for spam in spams:
        spam_words += tokenizer.tokenize(spam)
    for ham in hams:
        ham_words += tokenizer.tokenize(ham)

    naive_bayes = NaiveBayes()
    naive_bayes.load(ham_words, spam_words)

    # Evaluate on a separate labelled check file.
    test_spams = []
    test_hams = []
    load_data(test_hams, test_spams, 'res/test1_check.txt')

    spam_correct = 0
    spam_incorrect = 0
    for message in test_spams:
        # is_positive returns True for ham, so True on a spam message is a miss.
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            spam_incorrect += 1
        else:
            spam_correct += 1
    print('spam:', 'correct', spam_correct, 'incorrect', spam_incorrect)
    print('spam:', (spam_correct / (spam_incorrect + spam_correct)) * 100, '%')

    ham_correct = 0
    ham_incorrect = 0
    for message in test_hams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            ham_correct += 1
        else:
            ham_incorrect += 1
    print('ham:', 'correct', ham_correct, 'incorrect', ham_incorrect)
    print('ham:', (ham_correct / (ham_incorrect + ham_correct)) * 100, '%')
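# util.load_data is imported above but not included in this excerpt. A minimal
# sketch of what it might contain, assuming the SMSSpamCollection layout where
# each line is "<label>\t<text>" with label "ham" or "spam"; the real util
# implementation may differ.
def load_data(hams, spams, path):
    # Route each labelled line of the corpus into the matching message list.
    with open(path, encoding='utf-8') as corpus:
        for line in corpus:
            label, _, text = line.partition('\t')
            if label == 'spam':
                spams.append(text.strip())
            elif label == 'ham':
                hams.append(text.strip())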
def test2(is_from_beginning=True, training_percent=70):
    tokenizer = RegexpTokenizer(r'\w+')
    data = get_data('res/SMSSpamCollection.txt')

    # Take training_percent of the corpus for training, either from the
    # beginning or from the end, and keep the remainder as the test set.
    training_data_length = int((len(data) * training_percent) / 100)
    if is_from_beginning:
        training_data = data[:training_data_length]
        test_data = data[training_data_length:]
    else:
        training_data = data[-training_data_length:]
        test_data = data[:len(data) - training_data_length]

    training_hams = []
    training_spams = []
    divide_data(training_data, training_hams, training_spams)

    training_spam_words = []
    training_ham_words = []
    for ham in training_hams:
        training_ham_words += tokenizer.tokenize(ham)
    for spam in training_spams:
        training_spam_words += tokenizer.tokenize(spam)

    naive_bayes = NaiveBayes()
    naive_bayes.load(training_ham_words, training_spam_words)

    test_hams = []
    test_spams = []
    divide_data(test_data, test_hams, test_spams)

    spam_correct = 0
    spam_incorrect = 0
    for message in test_spams:
        # is_positive returns True for ham, so True on a spam message is a miss.
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            spam_incorrect += 1
        else:
            spam_correct += 1
    print('spam:', 'correct', spam_correct, 'incorrect', spam_incorrect)
    print('spam:', (spam_correct / (spam_incorrect + spam_correct)) * 100, '%')

    ham_correct = 0
    ham_incorrect = 0
    for message in test_hams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            ham_correct += 1
        else:
            ham_incorrect += 1
    print('ham:', 'correct', ham_correct, 'incorrect', ham_incorrect)
    print('ham:', (ham_correct / (ham_incorrect + ham_correct)) * 100, '%')
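# get_data and divide_data are likewise missing from this excerpt. Plausible
# sketches under the same assumption about the tab-separated corpus format;
# treat them as illustrations, not the actual util code.
def get_data(path):
    # Return the corpus as (label, separator, text) triples in file order.
    with open(path, encoding='utf-8') as corpus:
        return [line.rstrip('\n').partition('\t') for line in corpus]

def divide_data(data, hams, spams):
    # Split labelled triples into ham and spam message lists.
    for label, _, text in data:
        (spams if label == 'spam' else hams).append(text)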
import mimetypes
import os
import re
import string

import bs4
from nltk import stem
from nltk.corpus import stopwords
from termcolor import colored

from classifier import NaiveBayes


class Trainer(object):
    # Initializes the object
    # @param self Trainer object
    # @param directory The directory that contains the training folders
    # @param spam The sub-directory for the spam class
    # @param ham The sub-directory for the ham class
    # @param limit The number of emails to be scanned
    def __init__(self, directory='data/corpus2', spam='spam', ham='ham', limit=1000):
        self.spam_path = os.path.join(os.getcwd(), directory, spam)
        self.ham_path = os.path.join(os.getcwd(), directory, ham)
        self.limit = limit
        self.classifier = NaiveBayes()

    # A wrapper for the train_classifier function.
    # @param self The Trainer object
    # @param verbose Depending on verbosity, information will be printed
    # @return The classifier object
    def train(self, verbose=False):
        self.train_classifier(self.spam_path, 'spam', verbose)
        self.train_classifier(self.ham_path, 'ham', verbose)
        return self.classifier

    # Converts a document into tokens and extracts features as described in README.md
    # @param self The Trainer object
    # @param text The text to be scanned
    def extract_features(self, text):
        features = []
        porter = stem.porter.PorterStemmer()
        punctuation = str.maketrans('', '', string.punctuation)
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in text.split() if token not in stop_words]
        link = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        for token in tokens:
            # Skip tokens that are too short once punctuation is stripped.
            if len(token.translate(punctuation)) < 3:
                continue
            if "." + token in mimetypes.types_map:
                features.append('ATTACHMENT')
            elif token.isdigit():
                features.append('NUMBER')
            elif token.upper() == token:
                features.append('ALL_CAPS')
                features.append(porter.stem(token.translate(punctuation)).lower())
            elif link.match(token):
                features.append('LINK')
            else:
                features.append(porter.stem(token.translate(punctuation)).lower())
        return features

    # The function that does the actual training
    # @param path The path of the data to be trained
    # @param label The label under which the data is classified
    # @param verbose The verbosity of the statistics printed
    def train_classifier(self, path, label, verbose):
        emails = os.listdir(path)
        limit = min(len(emails), self.limit)
        if verbose:
            print(colored("Training %d emails in %s class" % (limit, label), 'green'))
        for email in emails[:self.limit]:
            if verbose and verbose > 1:
                print(colored("Working on file %s" % email, 'green'))
            # Read raw bytes so UnicodeDammit can repair mixed encodings.
            with open(os.path.join(path, email), 'rb') as email_file:
                raw = email_file.read()
            try:
                email_text = bs4.UnicodeDammit.detwingle(raw).decode('utf-8')
            except UnicodeDecodeError:
                print(colored("Skipping file %s because of bad encoding" % email, 'red'))
                continue
            email_text = email_text.encode('ascii', 'ignore').decode('ascii')
            features = self.extract_features(email_text)
            self.classifier.train(features, label)
        print(colored(self.classifier, 'green'))
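# A short driver for the Trainer class above, using the constructor defaults
# already present in the code (data/corpus2 with spam/ham sub-folders); the
# entry-point guard itself is illustrative.
if __name__ == '__main__':
    trainer = Trainer(limit=1000)
    classifier = trainer.train(verbose=True)  # prints per-class progress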
from nltk.tokenize import RegexpTokenizer

from classifier import NaiveBayes
from util import load_data

tokenizer = RegexpTokenizer(r'\w+')

# Train the classifier on the full labelled corpus.
spams = []
hams = []
load_data(hams, spams, 'res/SMSSpamCollection.txt')

spam_words = []
ham_words = []
for spam in spams:
    spam_words += tokenizer.tokenize(spam)
for ham in hams:
    ham_words += tokenizer.tokenize(ham)

naive_bayes = NaiveBayes()
naive_bayes.load(ham_words, spam_words)

# Classify messages interactively until the user types "stop".
while True:
    message = input("Enter your SMS: ")
    if message == "stop":
        break
    if naive_bayes.is_positive(tokenizer.tokenize(message)):
        print("ham")
    else:
        print("spam")
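# The classifier module itself is not part of this excerpt. The scripts above
# only assume a NaiveBayes object with load(ham_words, spam_words) and
# is_positive(tokens), where True means "ham". A minimal word-count sketch
# with Laplace smoothing that satisfies this interface; the real classifier
# may be implemented differently.
import math
from collections import Counter

class NaiveBayes:
    def load(self, ham_words, spam_words):
        # Per-class word frequencies from the flattened training tokens.
        self.ham_counts = Counter(ham_words)
        self.spam_counts = Counter(spam_words)
        self.ham_total = len(ham_words)
        self.spam_total = len(spam_words)
        self.vocab_size = len(set(ham_words) | set(spam_words))

    def is_positive(self, tokens):
        # Compare smoothed log-likelihoods under each class; class priors are
        # omitted because load() only receives word lists, not message counts.
        ham_score = spam_score = 0.0
        for token in tokens:
            ham_score += math.log(
                (self.ham_counts[token] + 1) / (self.ham_total + self.vocab_size))
            spam_score += math.log(
                (self.spam_counts[token] + 1) / (self.spam_total + self.vocab_size))
        return ham_score >= spam_score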
import numpy as np
import pandas as pd

# data, file, print_both, NaiveBayes and the metric lists (accuracy_list,
# CF_accuracy_list, CF_precision_list) are defined earlier in the script.

folds = np.array_split(data, 10)  # split the dataset into 10 folds

# For each fold, treat that fold as the test set and the other 9 as the train set.
for y in range(len(folds)):
    X_train = pd.DataFrame()
    for x in range(len(folds)):
        if x == y:
            # The current fold is the test set.
            y_test = folds[x]['class'].values
            X_test = folds[x].drop(['class'], axis=1)
        else:
            # Every other fold goes into the train set.
            X_train = pd.concat([X_train, folds[x]])
    y_train = X_train['class'].values
    X_train = X_train.drop(['class'], axis=1)

    nb = NaiveBayes()            # initialize the Naive Bayes classifier
    nb.fit(X_train, y_train)     # train the model on the train folds
    y_pred = nb.predict(X_test)  # predict on the held-out fold

    # Compute the error with respect to the zero-one loss function.
    error = nb.zero_one_loss_function(y_test, y_pred)
    printstr = "\nAccuracy of 0-1 loss for fold {0} ::: {1}".format(y, (1 - error))
    print_both(file, printstr)
    accuracy_list.append(1 - error)

    # Compute confusion-matrix metrics for this fold.
    acc, precision, recall = nb.confusion_matrix(y_test, y_pred)
    printstr = "\nCF for fold {0} ::: acc:: {1} :: precision:: {2} :: recall :: {3}".format(
        y, acc, precision, recall)
    print_both(file, printstr)
    CF_accuracy_list.append(acc)
    CF_precision_list.append(precision)
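# The zero_one_loss_function and confusion_matrix methods used above are not
# shown. Standalone sketches of what they plausibly compute for a binary task,
# assuming the positive class is labelled 1; the project's methods may differ.
def zero_one_loss(y_true, y_pred):
    # Fraction of misclassified samples; fold accuracy is 1 minus this value.
    return float(np.mean(np.asarray(y_true) != np.asarray(y_pred)))

def confusion_metrics(y_true, y_pred, positive=1):
    # Accuracy, precision and recall from true/false positive/negative counts.
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    tp = np.sum((y_pred == positive) & (y_true == positive))
    fp = np.sum((y_pred == positive) & (y_true != positive))
    fn = np.sum((y_pred != positive) & (y_true == positive))
    acc = float(np.mean(y_true == y_pred))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return acc, precision, recall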