def __init__(self, directory=os.path.abspath(os.path.join('.', 'data', 'corpus1')),
             spam='spam', ham='ham', limit=1500):
    """
    :param self: Trainer object
    :param directory: location of the training dataset
    :param spam: the subdirectory inside `directory` which holds spam
    :param ham: the subdirectory inside `directory` which holds ham
    :param limit: the maximum number of mails the classifier should be
        trained on
    """
    self.spamdir = os.path.join(directory, spam)
    self.hamdir = os.path.join(directory, ham)
    self.limit = limit
    self.classifier = NaiveBayesClassifier()
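
# A hedged sketch (not part of the original code) of how a trainer like the
# one above might enumerate mails from the labelled subdirectories while
# honouring `limit`; the helper name `iter_mail_paths` is hypothetical.
import os

def iter_mail_paths(directory, label, limit):
    """Yield up to `limit` file paths from a labelled subdirectory."""
    subdir = os.path.join(directory, label)
    for name in sorted(os.listdir(subdir))[:limit]:
        yield os.path.join(subdir, name)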
__author__ = 'Nestor Bermudez'
__email__ = '[email protected], [email protected]'

from classifier import NaiveBayesClassifier
from averageVectorFeatureExtractor import AverageVectorFeatureExtractor
from parser import Parser
from util import Util

if __name__ == '__main__':
    import time

    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    parser = Parser('part1data/yes_train.txt', 'part1data/no_train.txt')
    extractor = AverageVectorFeatureExtractor()
    classifier = NaiveBayesClassifier(smoothing=0.25)
    classifier.train(extractor.items(parser.items()))
    print('Training time: ' + str((time.perf_counter() - start) * 1000) + 'ms')

    evaluationData = Parser('part1data/yes_test.txt', 'part1data/no_test.txt')
    confusion_matrix, acc = classifier.evaluate(
        extractor.items(evaluationData.items()))
    Util.print_confusion_matrix(confusion_matrix, 2, 2)
    print('Overall accuracy: ', round(acc * 100, 2))

    labels = sorted(classifier.highest_likely_examples)
    for label in labels:
        features, _ = classifier.highest_likely_examples[label]
        print('Highest likelihood for class: ', label)
        Util.print_as_string(features, 25, 10)
        print('\n')
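
# The `smoothing=0.25` argument above suggests additive (Lidstone) smoothing.
# The classifier module is not shown; this is a hedged sketch of the estimate
# it likely computes: P(f|c) = (count(f, c) + alpha) / (count(c) + alpha * |V|).
from collections import Counter

def smoothed_likelihood(feature, label_counts, vocab_size, alpha=0.25):
    """Lidstone estimate: unseen features get a small non-zero probability."""
    total = sum(label_counts.values())
    return (label_counts[feature] + alpha) / (total + alpha * vocab_size)

counts = Counter({'dark': 3, 'light': 1})
print(smoothed_likelihood('dark', counts, vocab_size=2))  # seen feature
print(smoothed_likelihood('gray', counts, vocab_size=2))  # unseen, still > 0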
import csv
import string

# `f` is assumed to be an open file handle on the tab-separated dataset;
# NaiveBayesClassifier comes from the project's classifier module (not shown).
data = list(csv.reader(f, delimiter="\t"))

def clean(s):
    translator = str.maketrans("", "", string.punctuation)
    return s.translate(translator)

def normalize_string(text):  # parameter renamed from `string` to avoid shadowing the module
    litter = ['.', ',', '!', '"', '\'', ':', ' -', ' —', '(', ')']
    clear_string = text.lower()
    for symbol in litter:
        clear_string = clear_string.replace(symbol, '')
    return clear_string

X, y = [], []
for target, msg in data:
    X.append(msg)
    y.append(target)

X = [normalize_string(x) for x in X]
X_train, y_train, X_test, y_test = X[:3900], y[:3900], X[3900:], y[3900:]

model = NaiveBayesClassifier(1)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
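
# The classifier itself is not shown; a minimal, illustrative word-level
# multinomial Naive Bayes matching the fit/score interface and the
# positional smoothing constant in `NaiveBayesClassifier(1)` above.
# This is a sketch, not the original implementation.
import math
from collections import Counter, defaultdict

class MultinomialNBSketch:
    """Minimal multinomial NB over whitespace-tokenized strings."""
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        self.word_counts = defaultdict(Counter)
        self.class_counts = Counter(y)
        vocab = set()
        for text, label in zip(X, y):
            words = text.split()
            self.word_counts[label].update(words)
            vocab.update(words)
        self.vocab_size = len(vocab)

    def predict(self, text):
        total_docs = sum(self.class_counts.values())
        scores = {}
        for label, count in self.class_counts.items():
            score = math.log(count / total_docs)  # log prior
            total_words = sum(self.word_counts[label].values())
            for word in text.split():             # smoothed log likelihoods
                score += math.log(
                    (self.word_counts[label][word] + self.alpha)
                    / (total_words + self.alpha * self.vocab_size))
            scores[label] = score
        return max(scores, key=scores.get)

    def score(self, X, y):
        return sum(self.predict(x) == t for x, t in zip(X, y)) / len(y)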
__author__ = 'Nestor Bermudez'
__email__ = '[email protected], [email protected]'

from classifier import NaiveBayesClassifier
from singleBlockFeatureExtractor import SingleBlockFeatureExtractor  # alternative extractor, unused below
from blockGroupFeatureExtractor import BlockGroupFeatureExtractor
from unsegmentedDataParser import UnsegmentedDataParser
from parser import Parser
from util import Util

if __name__ == '__main__':
    import time

    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    parser = UnsegmentedDataParser('extradata')
    extractor = BlockGroupFeatureExtractor()
    classifier = NaiveBayesClassifier(smoothing=1.5)
    classifier.train(extractor.items(parser.items()))
    print('Training time: ' + str((time.perf_counter() - start) * 1000) + 'ms')

    evaluationData = Parser('part1data/yes_test.txt', 'part1data/no_test.txt')
    confusion_matrix, acc = classifier.evaluate(
        extractor.items(evaluationData.items()))
    Util.print_confusion_matrix(confusion_matrix, 2, 2)
    print('Overall accuracy: ', round(acc * 100, 2))
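
# `Util.print_confusion_matrix` is not shown; a hedged sketch of building a
# 2x2 confusion matrix from (actual, predicted) integer label pairs.
def build_confusion_matrix(pairs, n_classes=2):
    """Rows index the actual class, columns the predicted class."""
    matrix = [[0] * n_classes for _ in range(n_classes)]
    for actual, predicted in pairs:
        matrix[actual][predicted] += 1
    return matrix

for row in build_confusion_matrix([(0, 0), (0, 1), (1, 1), (1, 1)]):
    print(row)  # prints [1, 1] then [0, 2]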
    return cnf_mat  # tail of a confusion-matrix helper defined earlier in the module

# pandas (pd), numpy (np), sklearn's GaussianNB and accuracy_score are
# imported at the top of the module (not shown in this fragment).
def split_dataset(dataset: pd.DataFrame, train_frac):
    train = dataset.sample(frac=train_frac, random_state=300660)
    test = dataset.drop(train.index)
    return train.drop(columns='class'), test.drop(columns='class'), \
        train['class'], test['class']

# reading clean dataset
main_df = pd.read_csv(r'seeds_dataset_clean.txt', header=None, sep='\t')
main_df.columns = ['area', 'perimeter', 'compactness', 'kernel length',
                   'kernel width', 'asymmetry coef.', 'groove length', 'class']

nbc = NaiveBayesClassifier()
gnb = GaussianNB()

# finding best train/(train+test) ratio
train_fractions = np.linspace(start=0.1, stop=0.9, num=17)
nbc_prediction_accuracies = np.zeros((17, 1))
for idx, train_frac in enumerate(train_fractions):
    X_train, X_test, y_train, y_test = split_dataset(main_df, train_frac=train_frac)
    # alternatively sklearn.model_selection.train_test_split can be used
    nbc.fit(X_train, y_train)
    predictions = nbc.predict(X_test)
    nbc_prediction_accuracies[idx] = accuracy_score(y_test, predictions)
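
# A short follow-up sketch (not in the original) using the arrays defined
# above to report the best-scoring split.
best_idx = int(np.argmax(nbc_prediction_accuracies))
print('Best train fraction: {:.2f} (accuracy {:.3f})'.format(
    train_fractions[best_idx], float(nbc_prediction_accuracies[best_idx])))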
corpus_tokens = []
corpus_labels = []
for category in corpus.category_list:
    content = Tokenizer.load_category(category)
    if content:
        corpus_tokens.extend(content)
        corpus_labels.extend([corpus.category_list.index(category)] * len(content))

feature = Feature()
feature.make_vsm(corpus_tokens)
# feature.print_vsm()

# reduce feature, k==0 means auto detect
# feature.reducex(corpus_labels, cate_list=corpus.category_list)
feature.reduce_feature(corpus_labels, k=0)
feature_id = "feature.txt"
feature.store(feature_id)

# classify
# lib svm
classifier = LibSvmClassifier(feature_id)
y_actual, y_predict = classifier.do_classify()
Classifier.predict_info("Lib SVM", y_actual, y_predict)

# sklearn svm
classifier = SvmClassifier(feature.feature_vec, feature.feature_label)
y_actual, y_predict = classifier.do_classify()
Classifier.predict_info("Sklearn SVM", y_actual, y_predict)

# naive bayes
classifier = NaiveBayesClassifier(feature.feature_vec, feature.feature_label)
y_actual, y_predict = classifier.do_classify()
Classifier.predict_info("Naive Bayes", y_actual, y_predict)
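
# `Feature.reduce_feature` is not shown; a hedged, roughly equivalent sketch
# using scikit-learn's chi-squared feature selection over a term-count matrix
# (the toy documents and labels here are illustrative only).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

docs = ["good movie", "bad movie", "good plot"]
labels = [1, 0, 1]

vectorizer = CountVectorizer()
vsm = vectorizer.fit_transform(docs)          # vector-space model: docs x terms
reduced = SelectKBest(chi2, k=2).fit_transform(vsm, labels)
print(reduced.shape)                          # (3, 2): two best-scoring terms kept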
# -*- coding: utf-8 -*-
from classifier import NaiveBayesClassifier

nbc = NaiveBayesClassifier(
    "iris-treinamento.txt",
    ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])

vars_combinations = [['Sepal Length', 'Sepal Width'],
                     ['Sepal Length', 'Petal Width'],
                     ['Sepal Length', 'Petal Length'],
                     ['Petal Length', 'Petal Width'],
                     ['Petal Length', 'Sepal Width'],
                     ['Petal Width', 'Sepal Width']]

for vars_combination in vars_combinations:
    nbc.plot_two_var_normal(vars_combination)
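
# The six hand-written pairs above are exactly the 2-combinations of the four
# iris features; `itertools.combinations` can generate them (order within each
# pair differs from the hand-written list, which should not affect plotting).
from itertools import combinations

features = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
vars_combinations = [list(pair) for pair in combinations(features, 2)]
print(len(vars_combinations))  # 6 unordered pairs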
def load(self, data):
    storage_backend = MemoryBackend(data)
    self.classifier = NaiveBayesClassifier(storage_backend)
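
# `MemoryBackend` is not shown; one plausible shape is a thin in-memory
# key-value store the classifier reads its counts from. Everything below is
# an assumption for illustration, including the method names.
class MemoryBackendSketch:
    """Illustrative in-memory backend wrapping a dict of pre-computed state."""
    def __init__(self, data):
        self._data = dict(data)

    def get(self, key, default=None):
        return self._data.get(key, default)

    def set(self, key, value):
        self._data[key] = value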
# build_X, evaluate, NaiveBayesClassifier, KNNClassifier and sklearn's
# train_test_split are imported elsewhere in the module (not shown here).
X_train_val, X_test, y_train_val, y_test, ted_ids, X_ted = build_X(datapath)
print("X_train_val shape: {}, X_test shape: {}".format(
    X_train_val.shape, X_test.shape))
print("y_train_val shape: {}, y_test shape: {}".format(
    y_train_val.shape, y_test.shape))

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val,
                                                  test_size=0.1,
                                                  random_state=42)
print("X_train shape: {}, X_val shape: {}".format(X_train.shape, X_val.shape))
print("y_train shape: {}, y_val shape: {}".format(y_train.shape, y_val.shape))

nb_clf = NaiveBayesClassifier()
nb_clf.fit(X_train, y_train)
y_pred_val = nb_clf.predict(X_val)
y_pred_test = nb_clf.predict(X_test)
print('NB validation acc: {}'.format((y_pred_val == y_val).mean()))
evaluate(y_test, y_pred_test)

for k in [1, 5, 9]:
    knn_clf = KNNClassifier(k)
    knn_clf.fit(X_train, y_train)
    y_pred_val = knn_clf.predict(X_val)
    y_pred_test = knn_clf.predict(X_test)
    print('{}-nn validation acc: {}'.format(k, (y_pred_val == y_val).mean()))
    evaluate(y_test, y_pred_test)
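
# `evaluate` is not defined in this snippet; a hedged stand-in computing
# accuracy plus per-class recall with numpy. The name `evaluate_sketch` and
# the reported metrics are assumptions, not the original helper.
import numpy as np

def evaluate_sketch(y_true, y_pred):
    """Accuracy and per-class recall from true/predicted label arrays."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    print('test acc: {:.4f}'.format((y_true == y_pred).mean()))
    for label in np.unique(y_true):
        mask = y_true == label
        print('  class {} recall: {:.4f}'.format(
            label, (y_pred[mask] == label).mean()))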