def __init__(self,
             directory=os.path.abspath(os.path.join('.', 'data', 'corpus1')),
             spam='spam',
             ham='ham',
             limit=1500):
    """
    Record where the training data lives and create an empty classifier.

    :param self: Trainer object
    :param directory: location of the training dataset; the default is
        ``./data/corpus1``, made absolute once when this function is defined
    :param spam: the sub directory inside 'directory' which has spam
    :param ham: the sub directory inside 'directory' which has ham
    :param limit: the maximum number of mails the classifier should be
        trained over; stored as ``self.limit``
    """
    spam_path = os.path.join(directory, spam)
    ham_path = os.path.join(directory, ham)

    self.spamdir, self.hamdir = spam_path, ham_path
    self.limit = limit
    self.classifier = NaiveBayesClassifier()
class PluginTeachMe(Plugin):
    """
    Plugin that learns when to answer a message with one of the canned
    replies 'CMB', 'CTB' or "That's what she said!".

    The words of a message are fed to a NaiveBayesClassifier; the label
    'None' stands for "no reply".
    """

    def __init__(self, *args):
        Plugin.__init__(self, *args)
        self.classifier = None
        self.load({})
        # Message currently being handled (stripped) and the one before it.
        self.curr_msg = ''
        self.last_msg = ''
        # (words, reply) of the last reply the plugin said, or () if none.
        self.last_joke = ()
        # True when the previously handled message made the plugin reply.
        self.just_joked = False

    def load(self, data):
        """Rebuild the classifier on top of a MemoryBackend wrapping *data*."""
        storage_backend = MemoryBackend(data)
        self.classifier = NaiveBayesClassifier(storage_backend)

    def save(self):
        """Return the data held by the classifier's storage backend."""
        return self.classifier.storage.data

    def get_what_to_learn(self):
        """
        Map the current message to the label it teaches.

        :returns: 'CMB', 'CTB' or "That's what she said!" when the current
            message is exactly the matching keyword (all upper or all lower
            case), otherwise the string 'None'
        """
        if self.curr_msg in ('CMB', 'cmb'):
            return 'CMB'
        if self.curr_msg in ('CTB', 'ctb'):
            return 'CTB'
        if self.curr_msg in ('TWSS', 'twss'):
            return "That's what she said!"
        return 'None'

    def got_congratulated(self):
        """Return True when the current message is one of the 'GG' forms."""
        return self.curr_msg in ('GG', 'gg', 'GG Tofbot', 'gg Tofbot')

    def did_bad_joke(self):
        """Return True when the current message is one of the 'TG' forms."""
        return self.curr_msg in ('TG', 'tg', 'TG Tofbot', 'tg Tofbot')

    def handle_msg(self, msg_text, chan, nick):
        """
        Process one incoming message: learn from feedback, reply, or learn
        from what was said after the previous message.

        :param msg_text: raw text of the message; it is stripped before use
        :param chan: not used by this method
        :param nick: not used by this method
        """
        just_joked = self.just_joked
        self.just_joked = False
        self.last_msg = self.curr_msg
        self.curr_msg = msg_text.strip()

        if self.got_congratulated():
            # Positive feedback: reinforce the last reply for its words.
            if self.last_joke:
                self.classifier.train(*self.last_joke)
        elif self.did_bad_joke():
            # Negative feedback: teach that those words deserve no reply.
            if self.last_joke:
                self.classifier.train(self.last_joke[0], 'None')
        else:
            words = self.curr_msg.split()
            scores = self.classifier.classify(words)
            # NOTE(review): assumes classify() returns (label, score) pairs
            # with the best label first - confirm against the classifier.
            joke = 'None'
            if scores:
                joke = scores[0][0]

            if joke != 'None':
                self.say(joke)
                self.last_joke = (words, joke)
                # Previously this flag was never set, so the guard below
                # could never skip anything. Setting it here keeps the
                # message that follows our own reply from being used as a
                # training example for the message we already answered.
                self.just_joked = True
            elif not just_joked:
                # Learn from humans: the previous message is labelled with
                # whatever the current message teaches (possibly 'None').
                self.classifier.train(self.last_msg.split(),
                                      self.get_what_to_learn())
class PluginTeachMe(Plugin):
    """
    Plugin that learns when to answer a message with one of the canned
    replies "CMB", "CTB" or "That's what she said!".

    The words of a message are fed to a NaiveBayesClassifier; the label
    "None" stands for "no reply".
    """

    def __init__(self, *args):
        Plugin.__init__(self, *args)
        self.classifier = None
        self.load({})
        # Message currently being handled (stripped) and the one before it.
        self.curr_msg = ""
        self.last_msg = ""
        # (words, reply) of the last reply the plugin said, or () if none.
        self.last_joke = ()
        # True when the previously handled message made the plugin reply.
        self.just_joked = False

    def load(self, data):
        """Rebuild the classifier on top of a MemoryBackend wrapping *data*."""
        storage_backend = MemoryBackend(data)
        self.classifier = NaiveBayesClassifier(storage_backend)

    def save(self):
        """Return the data held by the classifier's storage backend."""
        return self.classifier.storage.data

    def get_what_to_learn(self):
        """
        Map the current message to the label it teaches.

        :returns: "CMB", "CTB" or "That's what she said!" when the current
            message is exactly the matching keyword (all upper or all lower
            case), otherwise the string "None"
        """
        if self.curr_msg in ("CMB", "cmb"):
            return "CMB"
        if self.curr_msg in ("CTB", "ctb"):
            return "CTB"
        if self.curr_msg in ("TWSS", "twss"):
            return "That's what she said!"
        return "None"

    def got_congratulated(self):
        """Return True when the current message is one of the "GG" forms."""
        return self.curr_msg in ("GG", "gg", "GG Tofbot", "gg Tofbot")

    def did_bad_joke(self):
        """Return True when the current message is one of the "TG" forms."""
        return self.curr_msg in ("TG", "tg", "TG Tofbot", "tg Tofbot")

    def handle_msg(self, msg_text, chan, nick):
        """
        Process one incoming message: learn from feedback, reply, or learn
        from what was said after the previous message.

        :param msg_text: raw text of the message; it is stripped before use
        :param chan: not used by this method
        :param nick: not used by this method
        """
        just_joked = self.just_joked
        self.just_joked = False
        self.last_msg = self.curr_msg
        self.curr_msg = msg_text.strip()

        if self.got_congratulated():
            # Positive feedback: reinforce the last reply for its words.
            if self.last_joke:
                self.classifier.train(*self.last_joke)
        elif self.did_bad_joke():
            # Negative feedback: teach that those words deserve no reply.
            if self.last_joke:
                self.classifier.train(self.last_joke[0], "None")
        else:
            words = self.curr_msg.split()
            scores = self.classifier.classify(words)
            # NOTE(review): assumes classify() returns (label, score) pairs
            # with the best label first - confirm against the classifier.
            joke = "None"
            if scores:
                joke = scores[0][0]

            if joke != "None":
                self.say(joke)
                self.last_joke = (words, joke)
                # Previously this flag was never set, so the guard below
                # could never skip anything. Setting it here keeps the
                # message that follows our own reply from being used as a
                # training example for the message we already answered.
                self.just_joked = True
            elif not just_joked:
                # Learn from humans: the previous message is labelled with
                # whatever the current message teaches (possibly "None").
                self.classifier.train(self.last_msg.split(),
                                      self.get_what_to_learn())
def __init__(
        self,
        directory=os.path.abspath(os.path.join('.', 'data', 'corpus1')),
        spam='spam',
        ham='ham',
        limit=1500
):
    """
    Store the dataset paths and the training limit, then build a new
    NaiveBayesClassifier.

    :param self: Trainer object
    :param directory: location of the training dataset (defaults to the
        absolute form of ``./data/corpus1``, computed when the function is
        defined)
    :param spam: the sub directory inside 'directory' which has spam
    :param ham: the sub directory inside 'directory' which has ham
    :param limit: the maximum number of mails the classifier should be
        trained over
    """
    self.limit = limit
    self.spamdir = os.path.join(directory, spam)
    self.hamdir = os.path.join(directory, ham)
    self.classifier = NaiveBayesClassifier()
__author__ = 'Nestor Bermudez'
__email__ = '[email protected], [email protected]'

from classifier import NaiveBayesClassifier
from averageVectorFeatureExtractor import AverageVectorFeatureExtractor
from parser import Parser
from util import Util

if __name__ == '__main__':
    import time

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # Train on the part-1 "yes"/"no" training files.
    parser = Parser('part1data/yes_train.txt', 'part1data/no_train.txt')
    extractor = AverageVectorFeatureExtractor()
    classifier = NaiveBayesClassifier(smoothing=0.25)
    classifier.train(extractor.items(parser.items()))
    elapsed_ms = (time.perf_counter() - start) * 1000
    print('Training time: ' + str(elapsed_ms) + 'ms')

    # Evaluate on the held-out test files.
    evaluationData = Parser('part1data/yes_test.txt', 'part1data/no_test.txt')
    confusion_matrix, acc = classifier.evaluate(
        extractor.items(evaluationData.items()))
    Util.print_confusion_matrix(confusion_matrix, 2, 2)
    print('Overall accuracy: ', round(acc * 100, 2))

    # Show, per class label (in sorted order), the example the classifier
    # recorded in highest_likely_examples.
    labels = sorted(classifier.highest_likely_examples.keys())
    for label in labels:
        features, _ = classifier.highest_likely_examples[label]
        print('Highest likelihood for class: ', label)
        Util.print_as_string(features, 25, 10)
        print('\n')
class Trainer(object):
    """
    Trains a NaiveBayesClassifier on the emails found in a spam directory
    and a ham directory.
    """

    def __init__(self,
                 directory=os.path.abspath(
                     os.path.join('.', 'data', 'corpus1')),
                 spam='spam',
                 ham='ham',
                 limit=1500):
        """
        :param self: Trainer object
        :param directory: location of the training dataset
        :param spam: the sub directory inside the 'directory' which has spam
        :param ham: the sub directory inside the 'directory' which has ham
        :param limit: The maximum number of mails the classifier should
            be trained over
        """
        self.spamdir = os.path.join(directory, spam)
        self.hamdir = os.path.join(directory, ham)
        self.limit = limit
        self.classifier = NaiveBayesClassifier()

    def train_classifier(self, path, label, verbose):
        """
        Train the classifier on the emails stored in one directory.

        At most ``self.limit`` directory entries are read. Files whose
        content cannot be decoded are reported and skipped. The classifier
        is printed once all files have been processed.

        Files are opened through their full path, so the process working
        directory is left untouched and a relative ``path`` keeps working
        across successive calls.

        :param self: Trainer object
        :param path: The path of the data directory
        :param label: The label under which the data directory is trained
        :param verbose: truthy prints a summary line; a value greater than
            1 additionally prints every file name
        """
        # List the directory once; the slice also gives the real number of
        # emails that will be processed (0 for an empty directory).
        emails = os.listdir(path)[:self.limit]
        summary = "Training {0} emails in {1} class".format(
            len(emails), label)

        if verbose:
            print(colored(summary, 'green'))
        logging.debug(summary)

        for email in emails:
            if verbose and verbose > 1:
                print(colored("Processing file: {0}".format(email), 'green'))
            logging.info("Processing file: {0}".format(email))

            email_path = os.path.join(path, email)
            # The context manager closes the file on every path, including
            # the "skip this file" path below.
            with open(email_path, 'r') as email_file:
                email_text = email_file.read()

            # Emails come in mixed encodings; BeautifulSoup's UnicodeDammit
            # is used to get something that decodes as UTF-8.
            try:
                email_text = bs4.UnicodeDammit.detwingle(
                    email_text).decode('utf-8')
            except Exception:
                # Best effort: report and skip undecodable files, but let
                # KeyboardInterrupt / SystemExit propagate.
                print(colored(
                    "Skipping file {0} due to bad encoding".format(email),
                    'red'))
                logging.error(
                    "Skipping file {0} due to bad encoding".format(
                        email_path))
                continue

            email_text = email_text.encode("ascii", "ignore")

            # Extracting the features from the text
            features = self.extract_features(email_text)

            # Training the classifier
            self.classifier.train(features, label)

        # Relies on the __str__ defined by the classifier class.
        print(self.classifier)

    def train(self, verbose=False):
        """
        Train on the spam directory, then on the ham directory.

        :param self: Trainer object
        :param verbose: Prints more details when truthy. Defaults to False
        :returns: the trained classifier
        """
        self.train_classifier(self.spamdir, 'spam', verbose)
        self.train_classifier(self.hamdir, 'ham', verbose)

        return self.classifier

    def extract_features(self, text):
        """
        Will convert the document into tokens and extract the features.

        Features produced
        - "ATTACHMENT": the token, prefixed with '.', is a known file
          extension in ``mimetypes.types_map``
        - "LINK": the token starts with an http(s) URL
        - "ALL_CAPS" (plus the stemmed word): the token equals its own
          upper-cased form
        - "NUMBER": the token consists of digits only
        - otherwise the stemmed, lower-cased word

        English stopwords and tokens shorter than 3 characters once
        punctuation is removed are ignored.

        :param self: Trainer object
        :param text: Email text from which we will extract features
        :returns: A list which contains the feature set
        """
        # for detecting whether the string is a link
        link = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

        # Will use PorterStemmer() for stemming
        porter_stemmer = stem.porter.PorterStemmer()

        # Load the stopword list once (it used to be re-read for every
        # token) and keep it in a set for constant-time membership tests.
        english_stopwords = set(stopwords.words("english"))

        features = []
        for token in text.split():
            if token in english_stopwords:
                continue

            bare_token = token.translate(None, string.punctuation)
            if len(bare_token) < 3:
                continue

            if token.isdigit():
                features.append("NUMBER")
            elif "." + token in mimetypes.types_map:
                # types_map is keyed by extension ('.pdf', '.zip', '.exe',
                # ...), so a bare extension word hints at an attachment.
                features.append("ATTACHMENT")
            elif token.upper() == token:
                # NOTE: this also holds for tokens without any letters.
                features.append("ALL_CAPS")
                features.append(porter_stemmer.stem(bare_token).lower())
            elif link.match(token):
                features.append("LINK")
            else:
                features.append(porter_stemmer.stem(bare_token).lower())

        return features
data = list(csv.reader(f, delimiter="\t"))


def clean(s):
    """Return *s* with every character of ``string.punctuation`` removed."""
    return s.translate(str.maketrans("", "", string.punctuation))


def normalize_string(string):
    """
    Lower-case *string* and delete a fixed set of punctuation fragments.

    The fragments are removed one after another, in the order listed.
    """
    fragments = ('.', ',', '!', '"', '\'', ':', ' -', ' —', '(', ')')
    result = string.lower()
    for fragment in fragments:
        result = result.replace(fragment, '')
    return result


# Every row of `data` must unpack into exactly (label, message).
y = [target for target, _ in data]
X = [normalize_string(msg) for _, msg in data]

# The first 3900 samples train the model; the rest are used for scoring.
split_at = 3900
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

model = NaiveBayesClassifier(1)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
for row in rows: [prediction] = model.predict([row.normal_title]) if prediction == 'good': news.append(row) return template('templates/news_recommendations', rows=news) def get_training_data(): rows = s.query(News).filter(News.label != None).all() X_train = [row.normal_title for row in rows] y_train = [row.label for row in rows] return X_train, y_train if __name__ == '__main__': s = session() X_train, y_train = get_training_data() model = NaiveBayesClassifier(1) model.fit(X_train, y_train) run(host='localhost', port=8080) # print(len(s.query(News).filter(News.label != None).all())) # cnt = 183 # X, y = get_training_data() # X_train, y_train, X_test, y_test = X[:cnt], y[:cnt], X[cnt:], y[cnt:] # model = NaiveBayesClassifier(1) # model.fit(X_train, y_train) # print(model.score(X_test, y_test))
return cnf_mat def split_dataset(dataset: pd.DataFrame, train_frac): train = dataset.sample(frac=train_frac, random_state=300660) test = dataset.drop(train.index) return train.drop(columns='class'), test.drop(columns='class'), \ train['class'], test['class'] # reading clean dataset main_df = pd.read_csv(r'seeds_dataset_clean.txt', header=None, sep='\t') main_df.columns = ['area', 'perimeter', 'compactness', 'kernel length', 'kernel width', 'asymmetry coef.', 'groove length', 'class'] nbc = NaiveBayesClassifier() gnb = GaussianNB() # finding best train/(train+test) ratio train_fractions = np.linspace(start=0.1, stop=0.9, num=17) nbc_prediction_accuracies = np.zeros((17, 1)) for idx, train_frac in enumerate(train_fractions): X_train, X_test, y_train, y_test = split_dataset(main_df, train_frac=train_frac) # alternatively sklearn.model_selection.train_test_split can be used nbc.fit(X_train, y_train) predictions = nbc.predict(X_test) nbc_prediction_accuracies[idx] = accuracy_score(y_test, predictions)
# Gather the tokenised documents of every category together with a numeric
# label: the position of the category in corpus.category_list.
corpus_tokens, corpus_labels = [], []
for category in corpus.category_list:
    content = Tokenizer.load_category(category)
    if not content:
        continue
    corpus_tokens.extend(content)
    label = corpus.category_list.index(category)
    corpus_labels.extend([label] * len(content))

# Build the vector space model, reduce it and store it on disk.
feature = Feature()
feature.make_vsm(corpus_tokens)
# feature.print_vsm()

# reduce feature, k==0 means auto detect
# feature.reducex(corpus_labels, cate_list=corpus.category_list)
feature.reduce_feature(corpus_labels, k=0)
feature_id = "feature.txt"
feature.store(feature_id)

# classify
# Each entry is (report title, factory). The factory is only called inside
# the loop, so every classifier is built, run and reported in turn.
classifier_specs = (
    # lib svm
    ("Lib SVM",
     lambda: LibSvmClassifier(feature_id)),
    # sklearn svm
    ("Sklearn SVM",
     lambda: SvmClassifier(feature.feature_vec, feature.feature_label)),
    # naive bayes
    ("Naive Bayes",
     lambda: NaiveBayesClassifier(feature.feature_vec,
                                  feature.feature_label)),
)
for title, build_classifier in classifier_specs:
    classifier = build_classifier()
    y_actual, y_predict = classifier.do_classify()
    Classifier.predict_info(title, y_actual, y_predict)
# -*- coding: utf-8 -*-
from classifier import NaiveBayesClassifier

# Names of the four variables handed to the classifier, in the order given.
SEPAL_LENGTH = 'Sepal Length'
SEPAL_WIDTH = 'Sepal Width'
PETAL_LENGTH = 'Petal Length'
PETAL_WIDTH = 'Petal Width'

nbc = NaiveBayesClassifier(
    "iris-treinamento.txt",
    [SEPAL_LENGTH, SEPAL_WIDTH, PETAL_LENGTH, PETAL_WIDTH])

# The six variable pairs, in the order they are plotted.
vars_combinations = [
    [SEPAL_LENGTH, SEPAL_WIDTH],
    [SEPAL_LENGTH, PETAL_WIDTH],
    [SEPAL_LENGTH, PETAL_LENGTH],
    [PETAL_LENGTH, PETAL_WIDTH],
    [PETAL_LENGTH, SEPAL_WIDTH],
    [PETAL_WIDTH, SEPAL_WIDTH],
]

for vars_combination in vars_combinations:
    nbc.plot_two_var_normal(vars_combination)
def load(self, data):
    """
    Replace ``self.classifier`` with a NaiveBayesClassifier whose storage
    is a MemoryBackend built from *data*.
    """
    self.classifier = NaiveBayesClassifier(MemoryBackend(data))
# Load the data and report the shape of each part.
(X_train_val, X_test,
 y_train_val, y_test,
 ted_ids, X_ted) = build_X(datapath)
print("X_train_val shape: {}, X_test shape: {}".format(
    X_train_val.shape, X_test.shape))
print("y_train_val shape: {}, y_test shape: {}".format(
    y_train_val.shape, y_test.shape))

# Hold out 10% of the train/val data for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1, random_state=42)
print("X_train shape: {}, X_val shape: {}".format(
    X_train.shape, X_val.shape))
print("y_train shape: {}, y_val shape: {}".format(
    y_train.shape, y_val.shape))

# Naive Bayes: fit, then predict on the validation and test sets.
nb_clf = NaiveBayesClassifier()
nb_clf.fit(X_train, y_train)
y_pred_val = nb_clf.predict(X_val)
y_pred_test = nb_clf.predict(X_test)
nb_val_acc = (y_pred_val == y_val).mean()
print('NB validation acc: {}'.format(nb_val_acc))
evaluate(y_test, y_pred_test)

# k-nearest neighbours: same procedure for each k.
for k in (1, 5, 9):
    knn_clf = KNNClassifier(k)
    knn_clf.fit(X_train, y_train)
    y_pred_val = knn_clf.predict(X_val)
    y_pred_test = knn_clf.predict(X_test)
    knn_val_acc = (y_pred_val == y_val).mean()
    print('{}-nn validation acc: {}'.format(k, knn_val_acc))
    evaluate(y_test, y_pred_test)
from train_test_split import get_train_test_split, get_label_lookup from feature_extraction import features_from_file from classifier import NaiveBayesClassifier import os ROOT_DIR = os.path.dirname(os.path.realpath(__file__)) if __name__ == '__main__': train, test = get_train_test_split( os.path.join(ROOT_DIR, 'data', 'emails'), 0.6) label_lookup = get_label_lookup( os.path.join(ROOT_DIR, 'data', 'labels.txt')) nb_classifier = NaiveBayesClassifier() training_data = [ (label_lookup[x], features_from_file(os.path.join(ROOT_DIR, 'data', 'emails', x))) for x in train ] nb_classifier.train(training_data) true_positive = true_negative = false_positive = false_negative = 0 for filename in test: predicted_label = nb_classifier.classify( features_from_file( os.path.join(ROOT_DIR, 'data', 'emails', filename)), 'spam', 'not_spam') if predicted_label == 'spam' and label_lookup[filename] == 'spam': true_positive += 1 if predicted_label == 'not_spam' and label_lookup[ filename] == 'not_spam':