# Command-line entry point: train the naive Bayes classifier on the input
# path given as the first command-line argument.
import sys

from naive_bayes_classifier import NaiveBayesClassifer

model = NaiveBayesClassifer()

# Check the argument count before indexing: ``sys.argv[1]`` raises IndexError
# when the script is run without arguments, which previously made the usage
# message below unreachable in exactly the case it was written for.
# An empty-string argument is still rejected, as before.
if len(sys.argv) > 1 and sys.argv[1]:
    model.train_all(sys.argv[1])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Done')
else:
    print('Must provide a path to input')
preprocess = PreprocessData() #vocab = preprocess.load_pickle_file('vocab') vocab = preprocess.generate_vocabulary(train_data_test) print(len(np.unique(vocab))) print("Normalizing train data") train_data = preprocess.normalize_text(train_data, 'train_data_normalized') train_data_test = preprocess.normalize_text(train_data_test, 'train_data_normalized') print("Normalizing test data") validation_data = preprocess.normalize_text(validation_data, 'validation_data_normalized') test_data = preprocess.normalize_text(test_data, 'test_data_normalized', train=False) naive_classifier = NaiveBayesClassifer(train_data_test, vocab) naive_classifier.train(train_data_test) alphas = np.linspace(0.09, 1, 20) # Best alpha submit 55.211% 0.09063157894736842 accuracies = [] for alpha in alphas: preds = naive_classifier.test_accuracy(validation_data[:, :-1], True, alpha) accuracy = np.mean(preds == validation_data[:, -1]) accuracies.append(accuracy) print("accuracy {0} for {1}".format(accuracy, str(alpha))) print("Best alpha: ", alphas[np.argmax(accuracies)]) #vocab = preprocess.generate_vocabulary(train_data) #naive_classifier_test = NaiveBayesClassifer(train_data, vocab) #naive_classifier_test.train(train_data) predictions = naive_classifier.test_accuracy(test_data, False,
# Command-line entry point: construct the classifier with ``load=True`` and
# run predictions on the input path given as the first command-line argument.
import sys

from naive_bayes_classifier import NaiveBayesClassifer

model = NaiveBayesClassifer(load=True)

# Check the argument count before indexing: ``sys.argv[1]`` raises IndexError
# when the script is run without arguments, which previously made the usage
# message below unreachable in exactly the case it was written for.
# An empty-string argument is still rejected, as before.
if len(sys.argv) > 1 and sys.argv[1]:
    model.predict_all(sys.argv[1])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Done')
else:
    print('Must provide a path to input')