def classify(text, sender=None, subject=None):
    """Train a Naive Bayes classifier on the stored training set and
    classify *text*.

    When given, *sender* and *subject* are folded in as extra boolean
    features. The full label/probability distribution is pretty-printed
    as a side effect; the return value is the human-readable name of the
    most probable category.
    """
    classifier = NaiveBayesClassifier.train(load_training_set())
    features = bag_of_words(extract_bigrams(text))
    # Optional metadata become additional presence features.
    if sender is not None:
        features[sender] = True
    if subject is not None:
        features[subject] = True
    dist = classifier.prob_classify(features)
    pprint(dict((categories[label], dist.prob(label))
                for label in dist.samples()))
    return categories[dist.max()]
def cross_validate(): training_set = load_training_set() random.shuffle(training_set) average = 0 cv = KFold(len(training_set), n_folds=10, indices=True, shuffle=False, random_state=None) for traincv, evalcv in cv: classifier = NaiveBayesClassifier.train(training_set[traincv[0]:traincv[len(traincv) - 1]]) acc = accuracy(classifier, training_set[evalcv[0]:evalcv[len(evalcv) - 1]]) print 'Range: ', evalcv[0], 'to', evalcv[len(evalcv) - 1] print 'Accuracy: %4.2f' % acc average += acc print 'Average accuracy: %4.2f' % (average / 10)
def classify(text, sender=None, subject=None):
    """Classify *text* with a freshly trained Naive Bayes model.

    *sender* and *subject*, when provided, are added to the feature set
    as boolean features. Prints the probability of every label and
    returns the winning category's display name.

    NOTE(review): this re-defines ``classify`` — an identical function
    appears earlier in the file; this later definition shadows it.
    """
    model = NaiveBayesClassifier.train(load_training_set())
    test_data = bag_of_words(extract_bigrams(text))
    for extra in (sender, subject):
        if extra is not None:
            test_data[extra] = True
    dist = model.prob_classify(test_data)
    probs = {}
    for label in dist.samples():
        probs[categories[label]] = dist.prob(label)
    pprint(probs)
    return categories[dist.max()]
def cross_validate(): training_set = load_training_set() random.shuffle(training_set) average = 0 cv = KFold(len(training_set), n_folds=10, indices=True, shuffle=False, random_state=None) for traincv, evalcv in cv: classifier = NaiveBayesClassifier.train( training_set[traincv[0]:traincv[len(traincv) - 1]]) acc = accuracy(classifier, training_set[evalcv[0]:evalcv[len(evalcv) - 1]]) print 'Range: ', evalcv[0], 'to', evalcv[len(evalcv) - 1] print 'Accuracy: %4.2f' % acc average += acc print 'Average accuracy: %4.2f' % (average / 10)