class PluginTeachMe(Plugin):
    """Joke-learning IRC plugin.

    Watches channel messages and keeps a two-message rolling window.
    Positive feedback ('GG') reinforces the last joke in the classifier;
    negative feedback ('TG') re-labels the joke's words as 'None'.
    Otherwise the classifier is asked for a joke, and when it stays
    silent the previous message is trained with whatever label the
    current message teaches (CMB / CTB / TWSS / None).
    """

    def __init__(self, *args):
        Plugin.__init__(self, *args)
        self.classifier = None
        self.load({})
        self.curr_msg = ''
        self.last_msg = ''
        self.last_joke = ()
        self.just_joked = False

    def load(self, data):
        """(Re)build the classifier over an in-memory storage dict."""
        storage_backend = MemoryBackend(data)
        self.classifier = NaiveBayesClassifier(storage_backend)

    def save(self):
        """Expose the raw storage dict so the bot core can persist it."""
        return self.classifier.storage.data

    def get_what_to_learn(self):
        """Map the current message to the joke label it teaches."""
        teachings = {
            'CMB': 'CMB',
            'cmb': 'CMB',
            'CTB': 'CTB',
            'ctb': 'CTB',
            'TWSS': "That's what she said!",
            'twss': "That's what she said!",
        }
        return teachings.get(self.curr_msg, 'None')

    def got_congratulated(self):
        """True when the current message praises the bot's last joke."""
        return self.curr_msg in ('GG', 'gg', 'GG Tofbot', 'gg Tofbot')

    def did_bad_joke(self):
        """True when the current message rejects the bot's last joke."""
        return self.curr_msg in ('TG', 'tg', 'TG Tofbot', 'tg Tofbot')

    def handle_msg(self, msg_text, chan, nick):
        """Process one channel message: learn from feedback or try a joke."""
        was_joking = self.just_joked
        self.just_joked = False
        self.last_msg = self.curr_msg
        self.curr_msg = msg_text.strip()

        if self.got_congratulated():
            # Positive feedback: reinforce the previous joke, if any.
            if self.last_joke:
                self.classifier.train(*self.last_joke)
            return

        if self.did_bad_joke():
            # Negative feedback: unlearn by re-labelling toward 'None'.
            if self.last_joke:
                self.classifier.train(self.last_joke[0], 'None')
            return

        words = self.curr_msg.split()
        scores = self.classifier.classify(words)
        joke = scores[0][0] if scores else 'None'
        if joke != 'None':
            self.say(joke)
            self.last_joke = (words, joke)
        elif not was_joking:
            # NOTE(review): just_joked is never set True anywhere visible
            # in this file, so this guard currently always passes —
            # confirm the intent against the rest of the project.
            self.classifier.train(self.last_msg.split(),
                                  self.get_what_to_learn())
class PluginTeachMe(Plugin):
    """Plugin that tells jokes and learns from channel feedback."""

    def __init__(self, *args):
        Plugin.__init__(self, *args)
        self.classifier = None
        self.load({})
        # Rolling two-message window plus the last joke we emitted.
        self.curr_msg = ""
        self.last_msg = ""
        self.last_joke = ()
        self.just_joked = False

    def load(self, data):
        """Initialise the classifier from a (possibly empty) storage dict."""
        self.classifier = NaiveBayesClassifier(MemoryBackend(data))

    def save(self):
        """Return the classifier's backing dict for persistence."""
        return self.classifier.storage.data

    def get_what_to_learn(self):
        """Translate the current message into a joke label ("None" if none)."""
        msg = self.curr_msg
        if msg == "CMB" or msg == "cmb":
            return "CMB"
        elif msg == "CTB" or msg == "ctb":
            return "CTB"
        elif msg == "TWSS" or msg == "twss":
            return "That's what she said!"
        return "None"

    def got_congratulated(self):
        """True when the current message praises the bot's last joke."""
        return self.curr_msg in ("GG", "gg", "GG Tofbot", "gg Tofbot")

    def did_bad_joke(self):
        """True when the current message rejects the bot's last joke."""
        return self.curr_msg in ("TG", "tg", "TG Tofbot", "tg Tofbot")

    def handle_msg(self, msg_text, chan, nick):
        """Handle one message: reinforce, unlearn, or attempt a new joke."""
        previously_joked = self.just_joked
        self.just_joked = False
        self.last_msg = self.curr_msg
        self.curr_msg = msg_text.strip()
        if self.got_congratulated():
            if self.last_joke:
                # Praise: train the classifier on the joke it just made.
                self.classifier.train(*self.last_joke)
        elif self.did_bad_joke():
            if self.last_joke:
                # Rejection: push the joke's words toward the "None" label.
                self.classifier.train(self.last_joke[0], "None")
        else:
            scores = self.classifier.classify(self.curr_msg.split())
            joke = scores[0][0] if scores else "None"
            if joke != "None":
                self.say(joke)
                self.last_joke = (self.curr_msg.split(), joke)
            else:
                if not previously_joked:
                    self.classifier.train(self.last_msg.split(),
                                          self.get_what_to_learn())
__email__ = '[email protected], [email protected]' from classifier import NaiveBayesClassifier from averageVectorFeatureExtractor import AverageVectorFeatureExtractor from parser import Parser from util import Util if __name__ == '__main__': import pdb import time start = time.clock() parser = Parser('part1data/yes_train.txt', 'part1data/no_train.txt') extractor = AverageVectorFeatureExtractor() classifier = NaiveBayesClassifier(smoothing=0.25) classifier.train(extractor.items(parser.items())) print('Training time: ' + str((time.clock() - start) * 1000) + 'ms') evaluationData = Parser('part1data/yes_test.txt', 'part1data/no_test.txt') confusion_matrix, acc = classifier.evaluate( extractor.items(evaluationData.items())) Util.print_confusion_matrix(confusion_matrix, 2, 2) print('Overall accuracy: ', round(acc * 100, 2)) labels = sorted(list(classifier.highest_likely_examples.keys())) for label in labels: features, _ = classifier.highest_likely_examples[label] print('Highest likelihood for class: ', label) Util.print_as_string(features, 25, 10) print('\n')
class Trainer(object): """ The trainer class """ def __init__(self, directory=os.path.abspath( os.path.join('.', 'data', 'corpus1')), spam='spam', ham='ham', limit=1500 ): """ :param self: Trainer object :param directory: location of the training dataset :param spam: the sub directory inside the 'directory' which has spam :param ham: the sub directory inside the 'directory' which has ham :param limit: The maximum number of mails, the classifier should \ be trained over with """ self.spamdir = os.path.join(directory, spam) self.hamdir = os.path.join(directory, ham) self.limit = limit self.classifier = NaiveBayesClassifier() def train_classifier(self, path, label, verbose): """ The function doing the actual classification here. :param self: Trainer object :param path: The path of the data directory :param label: The label underwhich the data directory is :param verbose: Decides the verbosity of the messages to be shown """ limit = len(os.listdir(path)) < self.limit and len(os.listdir(path)) \ or self.limit if verbose: print colored("Training {0} emails in {1} class".format( limit, label ), 'green' ) logging.debug("Training {0} emails in {1} class".format( limit, label ) ) # changing the path to that particular directory os.chdir(path) for email in os.listdir(path)[:self.limit]: if verbose and verbose > 1: print colored("Processing file: {0}".format(email), 'green') logging.info("Processing file: {0}".format(email)) email_file = open(email, 'r') # explicit better than implicit email_text = email_file.read() """ Don't even get me started on the Unicode issues that I faced here. Thankfullly 'BeautifulSoup' was there to our rescue. 
Thanks to Leonard Richardson for this module """ try: email_text = bs4.UnicodeDammit.detwingle( email_text).decode('utf-8') except: print colored("Skipping file {0} due to bad encoding".format(email), 'red') logging.error("Skipping file {0} due to bad encoding".format( os.path.join(path, email) ) ) continue email_file.close() email_text = email_text.encode("ascii", "ignore") # Extracting the features from the text features = self.extract_features(email_text) # Training the classifier self.classifier.train(features, label) """prints the __str__ overridden method in the class 'NaiveBayesClassier' """ print self.classifier def train(self, verbose=False): """ :param self: Trainer object :param verbose: Printing more details when Defaults to False """ self.train_classifier(self.spamdir, 'spam', verbose) self.train_classifier(self.hamdir, 'ham', verbose) return self.classifier def extract_features(self, text): """ Will convert the document into tokens and extract the features. Possible features - Attachments - Links in text - CAPSLOCK words - Numbers - Words in text So these are some possible features which would make an email a SPAM :param self: Trainer object :param text: Email text from which we will extract features :returns: A list which contains the feature set """ features = [] tokens = text.split() link = re.compile( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') # ^ for detecting whether the string is a link # Will use PorterStemmer() for stemming porterStemmer = stem.porter.PorterStemmer() # cleaning out the stopwords tokens = [ token for token in tokens if token not in stopwords.words( "english" ) ] for token in tokens: if len(token.translate(None, string.punctuation)) < 3: continue if token.isdigit(): features.append("NUMBER") elif "." 
+ token in mimetypes.types_map.keys(): """ >>> import mimetypes >>> mimetypes.types_map.keys() ['.obj', '.ra', '.wsdl', '.dll', '.ras', '.ram', '.bcpio', '.sh', '.m1v', '.xwd', '.doc', '.bmp', '.shar', '.js', '.src', '.dvi', '.aif', '.ksh', '.dot', '.mht', '.p12', '.css', '.csh', '.pwz', '.pdf', '.cdf', '.pl', '.ai', '.jpe', '.jpg', '.py', '.xml', '.jpeg', '.ps', '.gtar', '.xpm', '.hdf', '.nws', '.tsv', '.xpdl', '.p7c', '.ico', '.eps', '.ief', '.so', '.xlb', '.pbm', '.texinfo', '.xls', '.tex', '.rtx', '.html', '.aiff', '.aifc', '.exe', '.sgm', '.tif', '.mpeg', '.ustar', '.gif', '.ppt', '.pps', '.sgml', '.ppm', '.latex', '.bat', '.mov', '.ppa', '.tr', '.rdf', '.xsl', '.eml', '.nc', '.sv4cpio', '.bin', '.h', '.tcl', '.wiz', '.o', '.a', '.c', '.wav', '.vcf', '.xbm', '.txt', '.au', '.t', '.tiff', '.texi', '.oda', '.ms', '.rgb', '.me', '.sv4crc', '.qt', '.mpa', '.mpg', '.mpe', '.avi', '.pgm', '.pot', '.mif', '.roff', '.htm', '.man', '.etx', '.zip', '.movie', '.pyc', '.png', '.pfx', '.mhtml', '.tar', '.pnm', '.pyo', '.snd', '.cpio', '.swf', '.mp3', '.mp2', '.mp4'] >>> """ features.append("ATTACHMENT") elif token.upper() == token: features.append("ALL_CAPS") features.append( porterStemmer.stem( token.translate(None, string.punctuation) ).lower() ) elif link.match(token): features.append("LINK") else: features.append( porterStemmer.stem(token.translate( None, string.punctuation ) ).lower() ) return features
class Trainer(object): """ The trainer class """ def __init__(self, directory=os.path.abspath( os.path.join('.', 'data', 'corpus1')), spam='spam', ham='ham', limit=1500 ): """ :param self: Trainer object :param directory: location of the training dataset :param spam: the sub directory inside the 'directory' which has spam :param ham: the sub directory inside the 'directory' which has ham :param limit: The maximum number of mails, the classifier should \ be trained over with """ self.spamdir = os.path.join(directory, spam) self.hamdir = os.path.join(directory, ham) self.limit = limit self.classifier = NaiveBayesClassifier() def train_classifier(self, path, label, verbose): """ The function doing the actual classification here. :param self: Trainer object :param path: The path of the data directory :param label: The label underwhich the data directory is :param verbose: Decides the verbosity of the messages to be shown """ limit = len(os.listdir(path)) < self.limit and len(os.listdir(path)) \ or self.limit if verbose: print colored("Training {0} emails in {1} class".format( limit, label ), 'green' ) logging.debug("Training {0} emails in {1} class".format( limit, label ) ) # changing the path to that particular directory os.chdir(path) for email in os.listdir(path)[:self.limit]: if verbose and verbose > 1: print colored("Processing file: {0}".format(email), 'green') logging.info("Processing file: {0}".format(email)) email_file = open(email, 'r') # explicit better than implicit email_text = email_file.read() """ Don't even get me started on the Unicode issues that I faced here. Thankfullly 'BeautifulSoup' was there to our rescue. 
Thanks to Leonard Richardson for this module """ try: email_text = bs4.UnicodeDammit.detwingle( email_text).decode('utf-8') except: print colored("Skipping file {0} due to bad encoding".format(email), 'red') logging.error("Skipping file {0} due to bad encoding".format( os.path.join(path, email) ) ) continue email_file.close() email_text = email_text.encode("ascii", "ignore") # Extracting the features from the text features = self.extract_features(email_text) # Training the classifier self.classifier.train(features, label) """prints the __str__ overridden method in the class 'NaiveBayesClassier' """ print self.classifier def train(self, verbose=False): """ :param self: Trainer object :param verbose: Printing more details when Defaults to False """ self.train_classifier(self.spamdir, 'spam', verbose) self.train_classifier(self.hamdir, 'ham', verbose) return self.classifier def extract_features(self, text): """ Will convert the document into tokens and extract the features. Possible features - Attachments - Links in text - CAPSLOCK words - Numbers - Words in text So these are some possible features which would make an email a SPAM :param self: Trainer object :param text: Email text from which we will extract features :returns: A list which contains the feature set """ features = [] tokens = text.split() link = re.compile( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') # ^ for detecting whether the string is a link # Will use PorterStemmer() for stemming porterStemmer = stem.porter.PorterStemmer() # cleaning out the stopwords tokens = [ token for token in tokens if token not in stopwords.words( "english" ) ] for token in tokens: if len(token.translate(None, string.punctuation)) < 3: continue if token.isdigit(): features.append("NUMBER") elif "." 
+ token in mimetypes.types_map.keys(): """ >>> import mimetypes >>> mimetypes.types_map.keys() ['.obj', '.ra', '.wsdl', '.dll', '.ras', '.ram', '.bcpio', '.sh', '.m1v', '.xwd', '.doc', '.bmp', '.shar', '.js', '.src', '.dvi', '.aif', '.ksh', '.dot', '.mht', '.p12', '.css', '.csh', '.pwz', '.pdf', '.cdf', '.pl', '.ai', '.jpe', '.jpg', '.py', '.xml', '.jpeg', '.ps', '.gtar', '.xpm', '.hdf', '.nws', '.tsv', '.xpdl', '.p7c', '.ico', '.eps', '.ief', '.so', '.xlb', '.pbm', '.texinfo', '.xls', '.tex', '.rtx', '.html', '.aiff', '.aifc', '.exe', '.sgm', '.tif', '.mpeg', '.ustar', '.gif', '.ppt', '.pps', '.sgml', '.ppm', '.latex', '.bat', '.mov', '.ppa', '.tr', '.rdf', '.xsl', '.eml', '.nc', '.sv4cpio', '.bin', '.h', '.tcl', '.wiz', '.o', '.a', '.c', '.wav', '.vcf', '.xbm', '.txt', '.au', '.t', '.tiff', '.texi', '.oda', '.ms', '.rgb', '.me', '.sv4crc', '.qt', '.mpa', '.mpg', '.mpe', '.avi', '.pgm', '.pot', '.mif', '.roff', '.htm', '.man', '.etx', '.zip', '.movie', '.pyc', '.png', '.pfx', '.mhtml', '.tar', '.pnm', '.pyo', '.snd', '.cpio', '.swf', '.mp3', '.mp2', '.mp4'] >>> """ features.append("ATTACHMENT") elif token.upper() == token: features.append("ALL_CAPS") features.append( porterStemmer.stem( token.translate(None, string.punctuation) ).lower() ) elif link.match(token): features.append("LINK") else: features.append( porterStemmer.stem(token.translate( None, string.punctuation ) ).lower() ) return features
ROOT_DIR = os.path.dirname(os.path.realpath(__file__)) if __name__ == '__main__': train, test = get_train_test_split( os.path.join(ROOT_DIR, 'data', 'emails'), 0.6) label_lookup = get_label_lookup( os.path.join(ROOT_DIR, 'data', 'labels.txt')) nb_classifier = NaiveBayesClassifier() training_data = [ (label_lookup[x], features_from_file(os.path.join(ROOT_DIR, 'data', 'emails', x))) for x in train ] nb_classifier.train(training_data) true_positive = true_negative = false_positive = false_negative = 0 for filename in test: predicted_label = nb_classifier.classify( features_from_file( os.path.join(ROOT_DIR, 'data', 'emails', filename)), 'spam', 'not_spam') if predicted_label == 'spam' and label_lookup[filename] == 'spam': true_positive += 1 if predicted_label == 'not_spam' and label_lookup[ filename] == 'not_spam': true_negative += 1 if predicted_label == 'spam' and label_lookup[filename] == 'not_spam': false_positive += 1 if predicted_label == 'not_spam' and label_lookup[filename] == 'spam':