def __init__(self): """ initialises the naive bayes """ self.classes = ["drug", "group", "brand", "drug_n", "none"] self.nb = NaiveBayes(self.classes) self.tagger = PosTagger() self.set_feature_names()
def __init__(self, spam_emails_path, ham_emails_path, unknown_emails_path):
    # 0 - No spam (Ham)
    # 1 - Spam
    self.naive_bayes = NaiveBayes([0, 1])
    self.spam_emails_path = spam_emails_path
    self.ham_emails_path = ham_emails_path
    self.unknown_emails_path = unknown_emails_path
def test_blogs_bag(self):
    # Classify blog authors using bag-of-words
    train, test = self.split_blogs_corpus(BagOfWords)
    classifier = NaiveBayes()
    classifier.train(train)
    classified = classify(classifier, test)
    self.assertGreater(compute_all_stats(test, classified), 0.55)
def test_even_odd(self): """Classify numbers as even or odd""" classifier = NaiveBayes() classifier.train([EvenOdd(0, True), EvenOdd(1, False)]) test = [EvenOdd(i, i % 2 == 0) for i in range(2, 1000)] classified = classify(classifier, test) self.assertEqual(compute_all_stats(test, classified), 1.0)
def test_names_nltk(self):
    # Classify names using NLTK features
    train, test = self.split_names_corpus()
    classifier = NaiveBayes()
    classifier.train(train)
    classified = classify(classifier, test)
    self.assertGreater(compute_all_stats(test, classified), 0.70)
class Stacking():
    def __init__(self):
        pass

    def fit(self, X, y):
        # Fit the three base models and collect their training predictions
        self.rf = RandomForest(num_trees=15, max_depth=np.inf)
        self.rf.fit(X, y)
        y_rf = self.rf.predict(X)

        self.nb = NaiveBayes()
        self.nb.fit(X, y)
        y_nb = self.nb.predict(X)

        self.knn = KNN(k=3)
        self.knn.fit(X, y)
        y_knn = self.knn.predict(X)

        # Stack the base predictions as features for the meta-classifier
        newX = np.array([y_rf, y_nb, y_knn]).transpose()
        model = DecisionTree(max_depth=np.inf, stump_class=DecisionStumpErrorRate)
        self.model = model
        model.fit(newX, y)

    def predict(self, X):
        y_rf = self.rf.predict(X)
        y_nb = self.nb.predict(X)
        y_knn = self.knn.predict(X)
        x_test = np.array([y_rf, y_nb, y_knn]).transpose()
        return self.model.predict(x_test)
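# A minimal usage sketch for the Stacking class above, assuming RandomForest,
# NaiveBayes, KNN, DecisionTree and DecisionStumpErrorRate all come from the
# same project as the snippet (they are not library classes):
import numpy as np

X = np.random.randint(0, 2, size=(100, 5))  # toy binary feature matrix
y = np.random.randint(0, 2, size=100)       # toy binary labels

stacker = Stacking()
stacker.fit(X, y)             # fits the three base models, then the meta decision tree
y_hat = stacker.predict(X)    # base predictions are re-stacked as meta features
print("training accuracy:", np.mean(y_hat == y))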
def test_blogs_bag(self): """Classify blog authors using bag-of-words""" print "\nsplit_blogs_corpus" train, test = self.split_blogs_corpus(BagOfWords) classifier = NaiveBayes() classifier.train(train) self.assertGreater(accuracy(classifier, test), 0.55)
def test_blogs_bag(self): """Classify blog authors using bag-of-words""" train, test = self.split_blogs_corpus(BagOfWords) classifier = NaiveBayes() classifier.train(train) #classifier.save("/home/anna/StatNLP/model_all.p") self.assertGreater(accuracy(classifier, test), 0.55)
def fit(self, X, y):
    # Instantiate the input models
    rf = RandomForest(num_trees=15)
    knn = KNN(k=3)
    nb = NaiveBayes(num_classes=2)

    # Random forest: fit and predict
    rf.create_splits(X)
    rf.fit(X, y)
    rf_pred = rf.predict(X)

    # K-nearest neighbors: fit and predict
    knn.fit(X, y)
    knn_pred = knn.predict(X)

    # Naive Bayes: fit and predict
    nb.fit(X, y)
    nb_pred = nb.predict(X)

    # Use predictions from the input models as inputs for the meta-classifier
    meta_input = np.hstack((rf_pred.reshape((rf_pred.size, 1)),
                            knn_pred.reshape((knn_pred.size, 1)),
                            nb_pred.reshape((nb_pred.size, 1))))

    # Use a decision tree as the meta-classifier
    dt = DecisionTree(max_depth=np.inf)
    dt.fit(meta_input, y)

    self.rf = rf
    self.knn = knn
    self.nb = nb
    self.meta_classifier = dt
def test_names_nltk(self): """Classify names using NLTK features""" train, test = self.split_names_corpus() classifier = NaiveBayes() classifier.train(train) acc = accuracy(classifier, test) self.assertGreater(acc, 0.70)
def mnb(x_train, y_train, x_test, y_test):
    mynb = NaiveBayes()
    mynb.fit(x_train, y_train)
    pred = mynb.predict(x_test)
    print("The score of my Naive Bayes result (based on testing data): "
          + str(sum(pred == y_test) / len(pred)))
    return pred
def test_blogs_imba(self):
    train, test = self.split_blogs_corpus_imba(BagOfWords)
    classifier = NaiveBayes()
    classifier.train(train)
    # you don't need to pass this test
    classified = classify(classifier, test)
    self.assertGreater(compute_all_stats(test, classified), 0.1)
def main():
    neg_revs = read_reviews_in_file("./rt-polaritydata/rt-polarity.neg")
    pos_revs = read_reviews_in_file("./rt-polaritydata/rt-polarity.pos")

    nb = NaiveBayes(neg_revs, pos_revs, val_split=0.2)
    nb.evaluate_naive_bayes()

    lr = LogisticRegression(neg_revs, pos_revs, val_split=0.2, lr=0.85, num_inter=1000)
    lr.evaluate_logistic_regression()

    lr = LogisticRegression(neg_revs, pos_revs, val_split=0.2, lr=0.85, num_inter=3000)
    lr.evaluate_logistic_regression()

    # Just for fun - TensorFlow
    LogisticRegression_tf(neg_revs, pos_revs, val_split=0.2, lr=0.01, num_inter=200)
def __init__(self, no_of_testcases=100, verbose=True, nb=None, bw=None):
    self.logger = Logger('Comparer', 'logs\\comparer.log', is_verbose=verbose)
    self.load_html_structure()
    if nb is None:
        self.nb = NaiveBayes(verbose=False, test_set_count=no_of_testcases, no_of_grams=4)
        self.nb.ready()
    else:
        self.nb = nb
        self.nb.logger.is_verbose = False
    if bw is None:
        self.bw = BagOfWordSentiment(verbose=False, no_of_grams=4)
        self.bw.ready()
    else:
        self.bw = bw
        self.bw.logger.is_verbose = False
    self.no_of_testcases = no_of_testcases
    self.nb_correct, self.bw_correct, self.tb_correct = 0, 0, 0
    self.nb_wrong, self.bw_wrong, self.tb_wrong = 0, 0, 0
    self.nb_accuracy, self.bw_accuracy, self.tb_accuracy = 0, 0, 0
    self.counter = 0
    self.testcases = dict()
def test_names_nltk(self): """Classify names using NLTK features""" print "\ntest_names_nltk" train, test = self.split_names_corpus() classifier = NaiveBayes() classifier.train(train) self.assertGreater(accuracy(classifier, test), 0.70)
def test_even_odd(self): """Classify numbers as even or odd""" print "\ntest_even_odd" classifier = NaiveBayes() classifier.train([EvenOdd(0, True), EvenOdd(1, False)]) test = [EvenOdd(i, i % 2 == 0) for i in range(2, 1000)] self.assertEqual(accuracy(classifier, test), 1.0)
def test_collect_counts(self):
    classifier = NaiveBayes()
    classifier._collect_counts(self.training_set)
    cat_index = classifier.label_codebook.get_index('cat')
    dog_index = classifier.label_codebook.get_index('dog')
    purr_index = classifier.feature_codebook.get_index('purr')
    meow_index = classifier.feature_codebook.get_index('meow')
    bark_index = classifier.feature_codebook.get_index('bark')
    woof_index = classifier.feature_codebook.get_index('woof')
    print """Test collecting counts
    If any of these fails, check if you have updated the codebooks and
    check if the counts have been collected correctly without smoothing"""
    count_x_y = classifier.count_x_y_table
    self.assertEqual(count_x_y[purr_index, cat_index], 2)
    self.assertEqual(count_x_y[meow_index, cat_index], 2)
    self.assertEqual(count_x_y[bark_index, cat_index], 0)
    self.assertEqual(count_x_y[woof_index, cat_index], 1)
    self.assertEqual(count_x_y[purr_index, dog_index], 0)
    self.assertEqual(count_x_y[meow_index, dog_index], 1)
    self.assertEqual(count_x_y[bark_index, dog_index], 1)
    self.assertEqual(count_x_y[woof_index, dog_index], 2)
    count_y = classifier.count_y_table
    self.assertEqual(count_y[cat_index], 3)
    self.assertEqual(count_y[dog_index], 2)
def index(request):
    if request.method == 'POST':
        form = TextForm(request.POST)
        if form.is_valid():
            db_data = SpamData.objects.all()
            training_data = np.empty([len(db_data), 58])
            for i in range(0, len(db_data)):
                training_data[i, :] = db_data[i].get_data()
            input_vector = text_to_frequencies(form.cleaned_data['text'])
            bayes = NaiveBayes(training_data)
            classification = bayes.classify(input_vector)
            data = str(np.append(classification, input_vector))
            str_class = "NOT SPAM"
            if classification == 1:
                str_class = "SPAM"
            return render(request, 'spam_classifier/results.html', {
                'input': ImmutableTextForm(request.POST),
                'isspam': str_class,
                'details': data
            })
    else:
        form = TextForm()
    return render(request, 'spam_classifier/index.html', {'form': form})
def get_predictions_naive_bayes(train_data, train_target, test_data, q_tag=None):
    from naive_bayes import NaiveBayes
    nb = NaiveBayes(serial_filename=get_serial_filename_nb(q_tag=q_tag))
    nb.train(train_data, train_target)
    return nb.get_predictions(test_data)
def setUp(self):
    self.naive_bayes = NaiveBayes()
    with open('./data/jojo.dat', 'r') as file:
        for rec in file:
            name, serif = rec.strip().split("\t")
            self.naive_bayes.category = name
            self.naive_bayes.word = serif
            self.naive_bayes.learn()
def k_cross_validation(X, T, K, binary=0):
    # K-fold cross-validation
    fold_len = len(X) // K
    X_folds = []
    T_folds = []
    k = 0
    results = []
    mean_test_accuracy = 0
    mean_f_score = 0
    stddev_test_accuracy = 0
    stddev_f_score = 0

    # Slice the data into K folds
    for i in range(K):
        X_folds.append(X[k:k + fold_len])
        T_folds.append(T[k:k + fold_len])
        k += fold_len
        if k > len(X):
            k = len(X)

    # Train on K-1 folds, evaluate on the held-out fold
    for i in range(K):
        X_test = X_folds[i]
        T_test = T_folds[i]
        X_train = []
        T_train = []
        for j in range(K):
            if j == i:
                continue
            X_train = X_train + X_folds[j]
            T_train = T_train + T_folds[j]
        model = NaiveBayes(alpha=1)
        prior, likelihood, classes, vocabulary = model.fit(X_train, T_train, binary)
        model_prediction = model.predict(X_test, prior, likelihood, classes, vocabulary)
        test_accuracy, f_score = model.evaluate(model_prediction, T_test)
        results.append((test_accuracy, f_score))
        mean_test_accuracy += test_accuracy
        mean_f_score += f_score
        # print("TRAINING ACCURACY :", model.train_accuracy)
        # print("Run {} : test accuracy = {}, f-score = {}".format(i, test_accuracy, f_score))

    mean_test_accuracy /= K
    mean_f_score /= K
    for i in range(len(results)):
        stddev_test_accuracy += (results[i][0] - mean_test_accuracy) ** 2
        stddev_f_score += (results[i][1] - mean_f_score) ** 2
    stddev_test_accuracy = np.sqrt(stddev_test_accuracy / len(results))
    stddev_f_score = np.sqrt(stddev_f_score / len(results))
    return results, mean_test_accuracy, mean_f_score, stddev_test_accuracy, stddev_f_score
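# A hedged usage sketch for k_cross_validation above. The document/label shapes
# here are assumptions (the project's NaiveBayes.fit(X, T, binary) signature is
# the only hint); treat this as illustration, not the project's actual driver:
docs = [["free", "offer", "now"], ["meeting", "at", "noon"],
        ["win", "cash", "fast"], ["lunch", "with", "the", "team"]] * 25
labels = ["spam", "ham", "spam", "ham"] * 25

results, acc, f1, acc_sd, f1_sd = k_cross_validation(docs, labels, K=5)
print("mean accuracy %.3f (+/- %.3f), mean f-score %.3f (+/- %.3f)"
      % (acc, acc_sd, f1, f1_sd))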
def main(argv):
    setpath()
    try:
        opts, args = getopt.getopt(argv, "ht:e:", ["train=", "test="])
        if len(sys.argv) < 5:
            raise getopt.GetoptError(None)
    except getopt.GetoptError:
        print('\nusage: run.py -t <trainfile> -e <testfile>\n')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('run.py -t <trainfile> -e <testfile>')
            sys.exit()
        elif opt in ("-t", "--train"):
            trainfile = arg
        elif opt in ("-e", "--test"):
            testfile = arg

    from file_reader import FileReader
    from naive_bayes import NaiveBayes

    nb = NaiveBayes(trainfile)
    test_file_reader = FileReader(testfile)
    testData = test_file_reader.getRows()

    num_errors = 0
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0

    # Testing phase
    for idx, row in enumerate(testData):
        prediction = nb.binary_classify(row)
        if row[-1] != prediction:
            num_errors += 1.0
            print("Error on row: %s" % str(idx + 1))
            if row[-1] == '1':
                false_negative += 1
            else:
                false_positive += 1
        elif row[-1] == '0':
            true_negative += 1
        else:
            true_positive += 1

    print('\n\n--------------Error Count----------------')
    print(num_errors)
    print('\n\n--------------Accuracy----------------')
    print("\n\nThe Accuracy is " + str((len(testData) - num_errors) * 100 / len(testData)) + "%")
    print("\n===========The confusion matrix===========")
    print("\t No \t Yes")
    print("No \t", str(true_negative) + "\t", str(false_positive))
    print("Yes \t", str(false_negative) + "\t", str(true_positive))
def test_predict_record_with_binary_dataset(self):
    expected_prediction = 1
    test_record = [1, 1, 0]
    clf = NaiveBayes(self.extract_features)
    clf.fit(self.design_matrix, self.target_values)
    prediction = clf.predict_record(test_record)
    self.assertEqual(expected_prediction, prediction)
def test_prediction(self):
    print """Test basic classification"""
    classifier = NaiveBayes()
    classifier.train(self.training_set)
    predictions = [classifier.classify_instance(x) for x in self.training_set]
    self.assertEqual(predictions[0], 'cat')
    self.assertEqual(predictions[1], 'dog')
    self.assertEqual(predictions[2], 'cat')
    self.assertEqual(predictions[3], 'dog')
    self.assertEqual(predictions[4], 'dog')
def main():
    dataset = load_loan_defaulters()
    design_matrix = [row[:-1] for row in dataset]
    target_values = [row[-1] for row in dataset]
    clf = NaiveBayes(extract_features)
    clf.fit(design_matrix, target_values)
    prediction = clf.predict_record([1, 1, 50700])
    negation_word = " not " if prediction == 0.0 else " "
    print("We predict this person will" + negation_word + "default on their loans.")
def cross_validation_nb(Xs_train, Ys_train, feats):
    """Optimises the bandwidth parameter of the naive Bayes classifier
    using the cross-validation technique"""
    folds = 5
    kf = StratifiedKFold(n_splits=folds)
    #cross_error_list = []
    bws = []
    train_errors = []
    val_errors = []

    # Iterate over every bandwidth value from 0.02 to 0.6 with a step of 0.02
    for bw in np.arange(0.02, 0.6, 0.02):
        summed_train_errors = 0
        summed_val_errors = 0
        # Stratified k folds
        for train_idx, valid_idx in kf.split(Ys_train, Ys_train):
            # Obtain the training and validation folds from the training set
            x_train_set = Xs_train[train_idx]
            x_val_set = Xs_train[valid_idx]
            y_train_set = Ys_train[train_idx]
            y_val_set = Ys_train[valid_idx]
            # Fit naive Bayes for this specific bandwidth
            nb = NaiveBayes(bw, feats)
            train_error, kde_list, prior_class0, prior_class1 = nb.fit(
                x_train_set, y_train_set)
            val_error, pred_val = nb.predict(x_val_set, y_val_set, kde_list,
                                             prior_class0, prior_class1)
            summed_train_errors += train_error
            summed_val_errors += val_error
        bws.append(bw)
        train_errors.append(summed_train_errors / folds)
        val_errors.append(summed_val_errors / folds)

    # Choose the bandwidth with the lowest validation error
    best_bandwidth = 0
    best_bw_val_error = 100
    for i in range(len(bws)):
        if val_errors[i] < best_bw_val_error:
            best_bandwidth = bws[i]
            best_bw_val_error = val_errors[i]
    print("Best BW training")
    print(best_bandwidth)
    return (best_bandwidth, bws, train_errors, val_errors)
def main():
    parser = argparse.ArgumentParser(
        description='Naive Bayes for Spam Classification')
    parser.add_argument('--suffix', default="", type=str,
                        help='Dataset to be used for training')
    parser.add_argument('--alpha', default=0.001, type=float,
                        help='Smoothing factor of the model')
    args = parser.parse_args()

    # Load data
    data = DataLoader()
    data.load_data(args.suffix)

    # Initialize model
    model = NaiveBayes(data.vocab_size, args.alpha)

    # Train
    model.fit(data.trainX, data.trainY)

    # Evaluation
    predictions = []
    tp, tn, fp, fn = 0, 0, 0, 0
    for (x, y) in zip(data.testX, data.testY):
        probs = model.predict(x)
        labels = list(probs.keys())
        probs = list(probs.values())
        label = labels[np.argmax(probs)]
        if label == 0 and y == 0:
            tn += 1
        elif label == 1 and y == 1:
            tp += 1
        elif label == 0 and y == 1:
            fn += 1
        elif label == 1 and y == 0:
            fp += 1
        predictions.append(label)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print(f"Precision: {precision * 100:.2f}%")
    print(f"Recall: {recall * 100:.2f}%")
    print(f"Accuracy: {(tp + tn) / len(data.testY) * 100:.2f}%")
    print(f"F1 Score: {2 * (precision * recall) / (precision + recall)}")
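# The --alpha flag above is an add-alpha (Lidstone) smoothing factor. A minimal
# sketch of how such a smoothed likelihood is typically estimated; whether this
# NaiveBayes(vocab_size, alpha) does exactly this internally is an assumption:
def smoothed_likelihood(word_count, class_total, vocab_size, alpha):
    # P(word | class): zero counts stay non-zero, and the distribution
    # still sums to one over the vocabulary
    return (word_count + alpha) / (class_total + alpha * vocab_size)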
def naive_bayes():
    X_train, X_test, y_train, y_test = data.load_dbworld()
    start = timeit.default_timer()
    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    stop = timeit.default_timer()
    print("Accuracy of the model is : %f" % ((y_pred == y_test).sum() / len(y_test)))
    print("Running time : %f" % (stop - start))
def train_and_val():
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set(
        'dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.simple(training_data)

    training_set = []
    for index in training_set_indices:
        training_set.append(features_labels_pair[index])
    #dp.remove_ambiguous_entry(training_set)
    naive_bayes = NaiveBayes(training_set, 4, False)

    validation_set = []
    for index in validation_set_indices:
        validation_set.append(features_labels_pair[index])
    dp.remove_ambiguous_entry(validation_set)

    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in validation_set:
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        # rows: prediction, columns: true class
        confusion_matrix[prediction, correct_class] += 1
        #print prediction, correct_class
    print confusion_matrix
    print correct / total
def __init__(self):
    # Download stopwords
    nltk.download('stopwords')

    # Tweets and their labels
    self.tweets = []
    self.labels = []

    # Retrieve tweets
    file = open("tweets.txt", "r")
    # Tweet to be added to the tweets list
    tweet = ""
    # Line to be read from the file
    line = file.readline()
    while line:
        # If the line is the label, add the tweet and its label to the lists
        if line.startswith("$$$$$"):
            self.tweets.append(tweet)
            self.labels.append(int(line[5:].replace("\n", "")))
            # Clear the tweet buffer
            tweet = ""
        # Otherwise the line is part of the tweet
        else:
            tweet += line.replace("\n", "").strip().lower()
        # Read the next line
        line = file.readline()
    file.close()

    # Preprocessing
    preprocessor = Preprocessor(self.tweets, nltk.PorterStemmer())
    tweets = preprocessor.start()

    # Tokenize tweets
    self.tokenizer = Tokenizer()
    self.tokenizer.fit_on_texts(tweets)
    tokenized_tweets = self.tokenizer.texts_to_sequences(tweets)
    num_tokens = np.array([len(tokens) for tokens in tokenized_tweets])
    self.max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
    tokenized_tweets_padding = pad_sequences(tokenized_tweets, maxlen=self.max_tokens)

    X_train, X_test, y_train, y_test = train_test_split(
        tokenized_tweets_padding, self.labels, test_size=0.2, random_state=123)
    self.nb = NaiveBayes()
    self.nb.fit(X_train, y_train)
@classmethod
def setUpClass(cls):
    cls.dataset = cls.get_six_separable_points()
    cls.design_matrix = [row[:-1] for row in cls.dataset]
    cls.target_values = [row[-1] for row in cls.dataset]
    cls.clf = NaiveBayes(cls.extract_features)
    cls.clf.fit(cls.design_matrix, cls.target_values)
@classmethod
def setUpClass(cls):
    cls.dataset = load_loan_defaulters()
    cls.design_matrix = [row[:-1] for row in cls.dataset]
    cls.target_values = [row[-1] for row in cls.dataset]
    cls.clf = NaiveBayes(cls.extract_features)
    cls.clf.fit(cls.design_matrix, cls.target_values)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("labeled_data", help="The data file of labeled SMS texts")
    parser.add_argument("non_labeled_data", help="The data file of non-labeled SMS texts")
    parser.add_argument("database_file", help="The path of file to store the database")
    args = parser.parse_args()
    db_file = open(args.database_file, "wb")

    # Load the labeled dataset
    load_dataset(args.labeled_data, True)

    # Split the labeled dataset into a training set (90%) and a test set (10%)
    # to measure classification accuracy
    labeled_documents = list(documents)
    random.shuffle(labeled_documents)
    labeled_count = len(labeled_documents)
    labeled_train_count = int(math.ceil(labeled_count * 0.9))
    labeled_train = labeled_documents[:labeled_train_count]
    labeled_test = labeled_documents[labeled_train_count:]

    # Load the unlabeled dataset
    load_dataset(args.non_labeled_data, False)
    calc_tfidf()

    # Train the classifier
    classifier = NaiveBayes(
        2, [(document.words, document.label)
            for document in labeled_train if document.label is not None])

    # Measure classification accuracy; rows are true labels, columns predictions
    confusion_matrix = [[0, 0], [0, 0]]
    for document in labeled_test:
        predicted_label = classifier.predict(document.words)
        confusion_matrix[document.label][predicted_label] += 1
    print(
        "Confusion Matrix:\tPredicted 0\tPredicted 1\n"
        "\tReal 0\t\t%d\t\t%d\n\tReal 1\t\t%d\t\t%d\n"
        % (confusion_matrix[0][0], confusion_matrix[0][1],
           confusion_matrix[1][0], confusion_matrix[1][1]))

    # Classify the unlabeled documents
    for document in documents:
        if document.label is None:
            document.label = classifier.predict(document.words)

    # Save the search database
    pickle.dump(Database(documents, keywords), db_file)
def test_save_load(self):
    """Test saving and loading with blog classifier"""
    train, test = self.split_blogs_corpus(BlogFeatures)
    classifier = NaiveBayes()
    classifier.train(train)
    classifier.save("model")
    class2 = NaiveBayes()
    class2.load("model")
    self.assertGreater(accuracy(class2, test), 0.55)
def cross_validation(corpus, idf):
    nb_results = {'precision': [], 'recall': [], 'f1': []}
    knn_results = {'precision': [], 'recall': [], 'f1': []}
    vocab = sorted(idf.keys())
    random.shuffle(corpus)
    for i in range(10):
        print('cross validation', i)
        training, testing = split_data(corpus, i, 10)
        nb = NaiveBayes(training, vocab, 0.1)
        knn = KNN(5, 5)
        knn.fit([d.vector for d in training], [d.label for d in training])
        labels = [d.label for d in testing]
        nb_preds = [nb.predict(d) for d in testing]
        knn_preds = [knn.predict(d.vector) for d in testing]
        metrics = model_metrics(labels, nb_preds)
        for m, k in zip(metrics, ['precision', 'recall', 'f1']):
            nb_results[k].append(m)
        metrics = model_metrics(labels, knn_preds)
        for m, k in zip(metrics, ['precision', 'recall', 'f1']):
            knn_results[k].append(m)
    for m in ['precision', 'recall', 'f1']:
        print('nb', m)
        print(nb_results[m])
        print(m, 'nb mean', mean(nb_results[m]))
        print('knn', m)
        print(knn_results[m])
        print(m, 'knn mean', mean(knn_results[m]))
        diff = [a - b for a, b in zip(nb_results[m], knn_results[m])]
        print(m, 'diff')
        print(diff)
        t = mean(diff) / (stdev(diff) / len(diff) ** 0.5)
        print(m, 't value:', t)
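# The t value printed above is a paired t statistic over the 10 folds:
# t = mean(d) / (stdev(d) / sqrt(n)), where d holds the per-fold metric
# differences between the two models. A self-contained version of the same
# computation, with illustrative numbers:
from statistics import mean, stdev

def paired_t(a, b):
    d = [x - y for x, y in zip(a, b)]
    return mean(d) / (stdev(d) / len(d) ** 0.5)

print(paired_t([0.81, 0.79, 0.84], [0.76, 0.75, 0.80]))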
def rebuild_models(self):
    '''
    Rebuilds all models over the current labeled datasets.
    '''
    datasets = self.labeled_datasets
    if self.undersample_first:
        print "undersampling before building models.."
        datasets = self.undersample_labeled_datasets()
    all_train_sets, labels = self._datasets_to_matrices(datasets)
    self.models = [NB_Model(NaiveBayes.train(training_set, labels))
                   for training_set in all_train_sets]
def test_smoothing(self):
    print """Test smoothing
    Zero counts must not result in zero probability. When turning the counts
    into probability, some smoothing must be done
    """
    classifier = NaiveBayes()
    classifier.train(self.training_set)
    cat_index = classifier.label_codebook.get_index('cat')
    dog_index = classifier.label_codebook.get_index('dog')
    purr_index = classifier.feature_codebook.get_index('purr')
    bark_index = classifier.feature_codebook.get_index('bark')
    p_x_given_y = classifier.p_x_given_y_table
    self.assertNotEqual(p_x_given_y[bark_index, cat_index], 0)
    self.assertNotEqual(p_x_given_y[purr_index, dog_index], 0)
def test_probability_tables(self):
    print """Test probability tables
    Regardless of what kind of smoothing you do, the signs have to be right
    """
    classifier = NaiveBayes()
    classifier.train(self.training_set)
    cat_index = classifier.label_codebook.get_index('cat')
    dog_index = classifier.label_codebook.get_index('dog')
    meow_index = classifier.feature_codebook.get_index('meow')
    bark_index = classifier.feature_codebook.get_index('bark')
    p_x_given_y = classifier.p_x_given_y_table
    self.assertGreater(p_x_given_y[meow_index, cat_index],
                       p_x_given_y[meow_index, dog_index])
    self.assertLess(p_x_given_y[bark_index, cat_index],
                    p_x_given_y[bark_index, dog_index])
    p_y = classifier.p_y_table
    self.assertGreater(p_y[cat_index], p_y[dog_index])
def test_save_load_blogs_bag(self):
    train, test = self.split_blogs_corpus(BagOfWords)
    classifier = NaiveBayes()
    classifier.train(train)
    classifier.save('trained_model.p')
    c2 = NaiveBayes()
    c2.load('trained_model.p')
    self.assertEqual(classifier.model, c2.model)
    self.assertEqual(classifier.priorCount, c2.priorCount)
    self.assertEqual(classifier.countPerFeature, c2.countPerFeature)
def train_and_test():
    training_data = dp.read_data("dataset/splice-Xtrain.dat", "dataset/splice-Ytrain.dat")
    test_data = dp.read_data("dataset/test40.txt", "dataset/ytest40.txt")
    feature = Features()
    training_set = feature.simple(training_data)
    test_set = feature.simple(test_data)
    # dp.remove_ambiguous_entry(training_set)
    naive_bayes = NaiveBayes(training_set, 4, False)

    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for index in range(len(test_set)):
        feature_vector, correct_class = test_set[index]
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        # rows: prediction, columns: true class
        confusion_matrix[prediction, correct_class] += 1
    print confusion_matrix
    print correct / total
def main(args):
    train_file_path = "./data/restaurant_train.txt"
    test_file_path = "./data/restaurant_test.txt"
    model_file_path = "./perc_mod.m"

    data = Data()
    data.read_train_file(train_file_path)
    data.read_test_file(test_file_path)

    model = NaiveBayes(data)
    model.train(data)

    model = Perceptron(data)
    model.train(data, 5)
# coding: utf-8
# In this file we extract the features from .txt files. We assume that the
# training process uses the raw_data of each instance while the testing
# process uses the data of each instance. For convenience when using
# test_naive_bayes.py, the data and raw_data of each instance are populated
# at the same time.
import nltk
from helper import Alphabet, Instance
from naive_bayes import NaiveBayes
import util
from evaluator import split_train_test
import random
import argparse

#filelist1 = get_ipython().getoutput(u'ls txt_sentoken/neg/')
#filelist2 = get_ipython().getoutput(u'ls txt_sentoken/pos/')
#ins_list = []

nb = NaiveBayes()
ID = None
limits = None

# Get the feature selection function number
parser = argparse.ArgumentParser(description='choose a feature selection function')
parser.add_argument('ID', metavar='N', type=int)
parser.add_argument('limits', metavar='N', type=int)
args = parser.parse_args()
ID = args.ID
limits = args.limits

def load_instance(filepath):
    ins_list = []
    filelist = get_ipython().getoutput(u'ls ' + filepath)
    for filename in filelist:
        f = open(filepath + filename, 'r')
def main(): parser = AP.ArgumentParser(description = "A command-line interface for " \ "the maximum entropy classifier.") parser.add_argument("-d", "--datafile", action = "store", default = "blog-gender-dataset.txt", \ help = "specify the input data file (default: ") parser.add_argument("-g", "--gaussian_prior", dest = "gpv", action = "store", \ help = "specify the Gaussian prior variance") parser.add_argument("-m", "--mode", dest = "mode", action = "store", default = "train", \ help = "run as train, train/ test, exp(eriment)1, exp(eriment)2, exp(eriment)3") parser.add_argument("-s", "--save", dest = "outfile", action = "store", default = None, \ help = "specify output file to serialize trained classifier") parser.add_argument("-l", "--load", dest = "infile", action = "store", default = None, \ help = "specify input file to load trained classifier") parser.add_argument("-i", "--instances", dest = "instances", action = "store", default = None, \ help = "load preprocessed instances instead of data") parser.add_argument("-f", "--featurefile", dest = "featfile", action = "store", default = None, \ help = "serialize preprocessed instances") args = parser.parse_args() #parse argument structure #begin running classifier try: print "Importing data ... " if args.instances: #get serialized features instance_list = cPickle.load(open(args.instances, 'rb')) print "Done." else: #create features from data data_list = import_data(args.datafile) print "Done.\nExtracting features ... " instance_list = [] l = len(data_list) for i, (label, post) in enumerate(data_list): print "Featurizing string %d of %d ... " % (i, l) instance_list.append(Instance(label = label, data = featurize(post))) print "Done." if args.featfile: #serialize instance_list with open(args.featfile, 'wb') as outf: cPickle.dump(instance_list, outf) piv1 = int(.7 * len(instance_list)) #split training from test piv2 = int(.9 * len(instance_list)) #split test from dev training, test, dev = instance_list[:piv1], instance_list[piv1:piv2], \ instance_list[piv2:] if args.infile: #load a previously trained classifier with open(args.infile, 'rb') as inf: me_classifier = MaxEnt.from_dict(cPickle.load(inf)) else: #create a new classifier exec('me_classifier = MaxEnt(%s)' % args.gpv) #experiment one if re.search(r'exp.*1', args.mode): if not args.infile: print "Training classifier ... " me_classifier.train(training) print "Done.\nTesting classification ... " if args.outfile: with open(args.outfile, 'wb') as outf: cPickle.dump(me_classifier.to_dict(), outf) for data in [training, test]: test_classifier(me_classifier, data).print_out() #experiment two; run in batch as for i in {.05,...,numpy.Infinity} ... #run with -s $i.classifier elif re.search(r'exp.*2', args.mode): #for value in [.05, 0.1, .5, 1, 3, 5, 10, numpy.Infinity]: #for value in [10, numpy.Infinity]: #me_classifier = MaxEnt(value) print "Training classifier with Gaussian prior variance %s ..." \ % str(me_classifier.gaussian_prior_variance) me_classifier.train(training) print "Done. Testing classifier over dev set ..." test_classifier(me_classifier, dev).print_out() print "Done. Testing classifier over test set ..." test_classifier(me_classifier, test).print_out() print "Done.\n\n\n" #experiment three; run with -l 1.classifier elif re.search(r'exp.*3', args.mode): if not args.infile: print "Training Maximum Entropy classifier ... " me_classifier.train(training) print "Done." nb_classifier = NaiveBayes() print "Training Naive Bayes classifier ... 
" nb_classifier.train(training) print "Done.\nTesting Maximum Entropy over test set ... " test_classifier(me_classifier, test).print_out() print "Done.\nTesting Naive Bayes over test set ... " test_classifier(nb_classifier, test).print_out() if args.outfile: #serialize trained classifier with open(args.outfile, 'wb') as outf: cPickle.dump(me_classifier.to_dict(), outf) except: #something is WROOOONG parser.print_help() raise
from similarity_calculator import SimilarityCalculator
from naive_bayes import NaiveBayes
import constants
import pickle
import sys
import pdb
from collections import OrderedDict

if __name__ == '__main__':
    sc = SimilarityCalculator()
    with open(constants.NB_PKL_FILENAME, 'rb') as f:
        nb_classifier = pickle.load(f)
    nb_input = NaiveBayes()
    for query in sys.stdin:
        nb_input.word_count = {}
        nb_input.train(query, 'input')
        results = OrderedDict()
        for category in nb_classifier.word_count:
            sim_cos = sc.sim_cos(nb_input.word_count['input'],
                                 nb_classifier.word_count[category])
            results[category] = sim_cos
        for result in results:
            print('The similarity to category "%s" is %f' % (result, results[result]))
        best_score_before = 0.0
        best_category = ''
        for i, category in enumerate(results):
import os
import pickle
import constants
from naive_bayes import NaiveBayes
import utils

if __name__ == '__main__':
    utils.go_to_fetched_pages_dir()
    pages = utils.load_html_files()
    pkl_nb_path = os.path.join('..', constants.NB_PKL_FILENAME)

    # If a NaiveBayes object has already been pickled, train that one further
    if os.path.exists(pkl_nb_path):
        with open(pkl_nb_path, 'rb') as f:
            nb = pickle.load(f)
    else:
        nb = NaiveBayes()
    for page in pages:
        nb.train(page.html_body, constants.QUERY)

    # Save the classifier now that it has been trained
    with open(pkl_nb_path, 'wb') as f:
        pickle.dump(nb, f)
from helper import Instance
from naive_bayes import NaiveBayes

cat1 = Instance(label='cat', data=[1, 0], raw_data=['purr', 'purr', 'meow'])
cat2 = Instance(label='cat', data=[0, 2], raw_data=['meow', 'woof'])
cat3 = Instance(label='cat', data=[1], raw_data=['purr'])
dog1 = Instance(label='dog', data=[3, 2], raw_data=['bark', 'woof'])
dog2 = Instance(label='dog', data=[2, 0], raw_data=['woof', 'meow'])
training_set = [cat1, cat2, cat3, dog1, dog2]

classifier = NaiveBayes()
classifier._collect_counts(training_set)
cat_index = classifier.label_codebook.get_index('cat')
dog_index = classifier.label_codebook.get_index('dog')
purr_index = classifier.feature_codebook.get_index('purr')
meow_index = classifier.feature_codebook.get_index('meow')
bark_index = classifier.feature_codebook.get_index('bark')
woof_index = classifier.feature_codebook.get_index('woof')

# Test counting: each raw count carries add-one smoothing
count_x_y = classifier.count_table
assert count_x_y[purr_index, cat_index] == (2 + 1)
assert count_x_y[meow_index, cat_index] == (2 + 1)
assert count_x_y[bark_index, cat_index] == (0 + 1)
assert count_x_y[woof_index, cat_index] == (1 + 1)
assert count_x_y[purr_index, dog_index] == (0 + 1)
assert count_x_y[meow_index, dog_index] == (1 + 1)
assert count_x_y[bark_index, dog_index] == (1 + 1)
test_data_raw = [("dogs like the pool", 0), ("seals like the sea", 1)] def create_data(data): counts = list() labels = list() for text, label in data: counts.append(Counter(text.split())) labels.append(label) return (counts, labels) if __name__ == "__main__": train_data = create_data(train_data_raw) print("word_counts: {0}\nlabels: {1}\n" .format(train_data[0], train_data[1])) NB = NaiveBayes() NB.train(train_data) print("p_label0: {0}\np_label1: {1}\n" .format(exp(NB.p_c[0]), exp(NB.p_c[1]))) label0_p_x_given_c = [(word, exp(prob)) for word, prob in NB.p_x_given_c[0].items()] label1_p_x_given_c = [(word, exp(prob)) for word, prob in NB.p_x_given_c[1].items()] print("p_x_given_c_label0: {0}\np_x_given_c_label1: {1}\n" .format(label0_p_x_given_c, label1_p_x_given_c)) test_data = create_data(test_data_raw) predictions = NB.predict(test_data) data, labels = test_data NB.report(predictions, labels)
X = pca.transform(X, n_components=5)  # Reduce to 5 dimensions

# ..........................
#  TRAIN / TEST SPLIT
# ..........................
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# Rescale labels for Adaboost to {-1, 1}
rescaled_y_train = 2 * y_train - np.ones(np.shape(y_train))
rescaled_y_test = 2 * y_test - np.ones(np.shape(y_test))

# .......
#  SETUP
# .......
adaboost = Adaboost(n_clf=8)
naive_bayes = NaiveBayes()
knn = KNN(k=4)
logistic_regression = LogisticRegression()
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)

# ........
#  TRAIN
# ........
print "Training:"
print "\tAdaboost"
adaboost.fit(X_train, rescaled_y_train)
print "\tNaive Bayes"
if __name__ == '__main__':
    data = np.genfromtxt('data/spam.csv', delimiter=',')
    y = data[:, -1]
    X = data[:, 0:-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    print 'Train shape:', X_train.shape
    print 'Test shape:', X_test.shape
    print
    print "My Implementation:"
    my_nb = NaiveBayes()
    my_nb.fit(X_train, y_train)
    print 'Accuracy:', my_nb.score(X_test, y_test)
    my_predictions = my_nb.predict(X_test)
    print
    print "sklearn's Implementation:"
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    print 'Accuracy:', mnb.score(X_test, y_test)
    sklearn_predictions = mnb.predict(X_test)
    # Assert I get the same results as sklearn
    # (will raise an error if different)
    assert np.all(sklearn_predictions == my_predictions)
def test_even_odd(self): """Classify numbers as even or odd""" classifier = NaiveBayes() classifier.train([EvenOdd(0, True), EvenOdd(1, False)]) test = [EvenOdd(i, i % 2 == 0) for i in range(2, 1000)] self.assertEqual(accuracy(classifier, test), 1.0)
def test_blogs_bag(self): """Classify blog authors using bag-of-words""" train, test = self.split_blogs_corpus(BagOfWords) classifier = NaiveBayes() classifier.train(train) self.assertGreater(accuracy(classifier, test), 0.55)
from naive_bayes import NaiveBayes
import constants
import pickle
import sys
import pdb
from collections import OrderedDict

if __name__ == '__main__':
    sc = SimCalculator()
    with open(constants.NB_PKL_FILENAME, 'rb') as f:
        nb_classifier = pickle.load(f)

    # An NB object is used here so that train and word_count can turn the
    # stdin string into the form {'input': {'スギ花粉': 4, '薬': 3}}.
    # It is not used as a classifier, so strictly a separate class should do
    # this, but that is more trouble than it is worth.
    nb_input = NaiveBayes()
    for query in sys.stdin:
        nb_input.word_count = {}  # reset for the second and later inputs
        nb_input.train(query, 'input')  # learn the stdin string as the 'input' category
        results = OrderedDict()
        for category in nb_classifier.word_count:
            # sim_simpson can be used instead of sim_cos
            sim_cos = sc.sim_cos(nb_input.word_count['input'],
                                 nb_classifier.word_count[category])
            results[category] = sim_cos
        for result in results:
            print('The similarity to category "%s" is %f' % (result, results[result]))
        # Following http://cointoss.hatenablog.com/entry/2013/10/16/123129
        # still does not yield the key with the max value :(
        best_score_before = 0.0
def main():
    if not RUN_DIRTY:
        clean_up()
    if not RUN_DIRTY and not os.path.exists(TRAINING_PATH) and not os.path.exists(TEST_PATH):
        print("Splicing raw data")
        split_data.split_data(RAW_PATH)

    training_labels_file = TRAINING_PATH + "/_label"
    test_labels_file = TEST_PATH + "/_label"
    print("Reading labels")
    training_labels = file_util.read_line_list(training_labels_file)
    test_labels = file_util.read_line_list(test_labels_file)

    training_tokens_path = tokenizer.get_token_path(TRAINING_PATH)
    test_tokens_path = tokenizer.get_token_path(TEST_PATH)
    print("Tokenizing...")
    if not os.path.exists(training_tokens_path):
        print("Tokenizing training set...")
        tokenizer.tokenize_path(TRAINING_PATH)
        print("Training set tokenization complete")
    if not os.path.exists(test_tokens_path):
        print("Tokenizing test set...")
        tokenizer.tokenize_path(TEST_PATH)
        print("Test set tokenization complete")

    print("Reading tokens")
    training_set_tokens = article_util.load_tokenized_articals(training_tokens_path)
    test_set_tokens = article_util.load_tokenized_articals(test_tokens_path)

    print("Training naive bayes")
    naive_bayes = NaiveBayes(training_set_tokens, training_labels)

    print("Validating with training set")
    training_true_positives = 0
    training_false_positives = 0
    training_false_negatives = 0
    for i in range(len(training_set_tokens)):
        predicted_class = naive_bayes.classify(training_set_tokens[i], N_OF_WORDS, N_OF_COMMAS)
        if VERBOSE:
            print("Predicted " + training_labels[i] + " as " + predicted_class)
        if predicted_class == training_labels[i]:
            training_true_positives += 1
        else:
            # a misclassification counts as both a false positive and a false negative
            training_false_positives += 1
            training_false_negatives += 1
    training_precision = training_true_positives / (training_true_positives + training_false_positives)
    training_recall = training_true_positives / (training_true_positives + training_false_negatives)
    training_f_score = (2 * training_precision * training_recall) / (training_precision + training_recall)
    print("Training Precision " + str(training_precision))
    print("Training Recall " + str(training_recall))
    print("Training F-Score " + str(training_f_score))

    print("*" * 50)
    print("Validating with test set")
    test_true_positives = 0
    test_false_positives = 0
    test_false_negatives = 0
    for i in range(len(test_set_tokens)):
        predicted_class = naive_bayes.classify(test_set_tokens[i], N_OF_WORDS, N_OF_COMMAS)
        if VERBOSE:
            print("Predicted " + test_labels[i] + " as " + predicted_class)
        if predicted_class == test_labels[i]:
            test_true_positives += 1
        else:
            test_false_positives += 1
            test_false_negatives += 1
    test_precision = test_true_positives / (test_true_positives + test_false_positives)
    test_recall = test_true_positives / (test_true_positives + test_false_negatives)
    test_f_score = (2 * test_precision * test_recall) / (test_precision + test_recall)
    print("Test Precision " + str(test_precision))
    print("Test Recall " + str(test_recall))
    print("Test F-Score " + str(test_f_score))