def load_classifier():
    """Return the classifier stored in ``classifier.db``.

    When the file does not exist yet, a classifier is first built with
    ``train_all(None, corpus)`` and saved to that path; the result is then
    read back from disk, so both paths return via ``Classifier.load``.
    """
    db_path = 'classifier.db'
    if os.path.exists(db_path):
        return Classifier.load(db_path)

    # First run: train from the corpus and persist before loading.
    trained = train_all(None, corpus)
    trained.save(db_path)
    return Classifier.load(db_path)
def load_classifier():
    """Return the classifier stored in ``classifier.db``.

    If the file does not exist, a message is written to stderr and the
    process exits with status 1 (``SystemExit`` is raised).
    """
    db_path = 'classifier.db'
    if not os.path.exists(db_path):
        # Terminate the message with a newline so the shell prompt (or any
        # following output) does not end up on the same line as the error.
        sys.stderr.write(
            'Unable to load classifier.db -- please run this script first\n'
        )
        sys.exit(1)
    return Classifier.load(db_path)
subj = "" for line in text.splitlines(): if line.startswith("subject:"): is_subj = True subj = line[8:] else: lines.append(line) return email_extract(subj, " ".join(lines), min_len, max_len) def test_enron_files(classifier, path, label, selector=None): files = get_file_list(path, selector) correct = total = 0 for filename in files: with open(os.path.join(path, filename)) as fh: contents = fh.read() features = enron_email_extract(contents) res = classifier.classify(features) best = res[0][0] if best == label: correct += 1 total += 1 pct = 100 * (float(correct) / total) print 'Accuracy of "%s": %s%% based on %s documents' % (label, pct, total) if __name__ == "__main__": classifier = Classifier.load("classifier.bin") for d, l in get_dir_and_labels(): test_enron_files(classifier, d, l)