def run(self):
    """Train one classifier per n-gram setting and evaluate it.

    When ``self.usedev`` is false, each trained classifier is evaluated
    once with ``self.stdout`` switched on and nothing is recorded.

    When ``self.usedev`` is true, each trained classifier is evaluated for
    every weight in ``self.allweights`` combined with every ordered pair of
    thresholds drawn from ``self.allthresholds``. The five values returned
    by ``self.evaluate`` are appended to ``self.results`` as a list, and the
    results are flushed to CSV afterwards when ``self.csvout`` is set.
    """
    if self.usedev:
        for ngrams in self.allgrams:
            classifier = NaiveBayesClassifier(self.rawfname, grams=ngrams)
            classifier.trainClassifier()
            for weight in self.allweights:
                classifier.setWeight(weight)
                for neg_threshold in self.allthresholds:
                    for pos_threshold in self.allthresholds:
                        classifier.setThresholds(neg=neg_threshold,
                                                 pos=pos_threshold)
                        (cinfo, accpos, accneg,
                         accall, corrall) = self.evaluate(classifier)
                        row = [cinfo, accpos, accneg, accall, corrall]
                        self.results.append(row)
        if self.csvout:
            self.flushToCSV()
        return

    # Plain run: evaluate each n-gram setting once, reporting to stdout.
    for ngrams in self.allgrams:
        classifier = NaiveBayesClassifier(self.rawfname, grams=ngrams)
        classifier.trainClassifier()
        self.stdout = True
        self.evaluate(classifier)
def assignment_e_naivebayes_2():
    """Replicate Example 13.1 on pages 241 and 242 in the textbook.

    Trains a NaiveBayesClassifier on a three-document "china" corpus and a
    one-document "not china" corpus, classifies one buffer, and asserts that
    exactly two matches are reported, in the expected order and with the
    expected probabilities. Reported scores are log-probabilities, so they
    are exponentiated before being compared.
    """
    # These are really language-specific functions, so using them for a
    # language identifier is a huge simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    def match_collector(match: dict):
        """Store a reported match and echo its score and category."""
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    china_bodies = [
        "Chinese Beijing Chinese",
        "Chinese Chinese Shanghai",
        "Chinese Macao",
    ]
    china = InMemoryCorpus()
    for doc_id, body in enumerate(china_bodies):
        china.add_document(InMemoryDocument(doc_id, {"body": body}))

    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))

    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)

    assert len(results) == 2
    best, other = results
    assert best["category"] == "china"
    assert other["category"] == "not china"
    assert math.isclose(math.exp(best["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(other["score"]), 0.0001, abs_tol=0.00005)
def run(self):
    """Train and evaluate a classifier for the first n-gram setting.

    The classifier is built from ``self.rawfname`` with the first entry of
    ``self.allgrams``, trained, and evaluated with ``self.stdout`` switched
    off.

    NOTE(review): the previous body continued with a weight/threshold grid
    search that appended to ``self.results``. It could never execute because
    this method returns unconditionally during the first loop iteration, so
    it has been removed. Restore it behind an explicit condition if the grid
    search is still wanted.

    Returns:
        Whatever ``self.evaluate`` returns for the trained classifier, or
        ``None`` when ``self.allgrams`` is empty.
    """
    for grams in self.allgrams:
        classifier = NaiveBayesClassifier(self.rawfname, grams=grams)
        classifier.trainClassifier()
        self.stdout = False
        # Only the first n-gram setting is ever evaluated.
        return self.evaluate(classifier)
    return None
def main():
    """Start an interactive loop that classifies typed text by language.

    One InMemoryCorpus per language code is loaded from
    ``<data_path>/<language>.txt``, a NaiveBayesClassifier is built over the
    "body" field, and ``simple_repl`` is then handed an evaluator that
    returns every match the classifier reports for the entered text.
    """
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier

    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]

    training_set = {}
    for code in languages:
        corpus_file = os.path.join(data_path, f"{code}.txt")
        training_set[code] = InMemoryCorpus(corpus_file)

    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    print("Returned scores are log-probabilities.")

    def evaluator(text):
        """Return the list of matches reported for *text*."""
        matches = []
        classifier.classify(text, matches.append)
        return matches

    simple_repl("text", evaluator)
def test_language_detection_trained_on_some_news_corpora(self):
    """Check the top category for one sample sentence per language.

    A classifier is trained on one corpus file per language code (read from
    ``data_path``), and each sample sentence is then verified through
    ``self._classify_buffer_and_verify_top_categories``.
    """
    import os.path
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier

    training_set = {}
    for code in ["en", "no", "da", "de"]:
        corpus_file = os.path.join(data_path, f"{code}.txt")
        training_set[code] = InMemoryCorpus(corpus_file)

    classifier = NaiveBayesClassifier(training_set, ["body"],
                                      self._normalizer, self._tokenizer)

    # (sentence, expected language code), checked in this order.
    samples = (
        ("Vil det riktige språket identifiseres? Dette er bokmål.", "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de"),
    )
    for sentence, expected in samples:
        self._classify_buffer_and_verify_top_categories(
            sentence, classifier, [expected])
class NaiveBayesClassifierTest(unittest.TestCase):
    """Measures the error rate of a previously saved NaiveBayesClassifier."""

    def test_predict(self):
        """Predict a category for every test document and report the errors.

        Documents are read from the sub-directories of ``naive_test_data/``;
        the name of the directory that holds a file is taken as its actual
        category. Mispredictions are printed as they occur, followed by a
        summary line.

        Returns:
            The fraction of documents that were mispredicted.
        """
        # Local import: io.open decodes while reading on both Python 2 and 3
        # and can be used as a context manager, so no handle is leaked.
        import io

        with io.open("stopwords.dic", 'r', encoding='utf-8') as stop_file:
            STOP_WORDS = set(line.strip() for line in stop_file)

        def tokenize(text):
            """Return the set of stripped, non-stop-word segments of *text*."""
            try:
                seg_list = jieba.cut(text, cut_all=False)
                return set(
                    x.strip() for x in seg_list if x not in STOP_WORDS)
            except Exception as e:
                # Best effort: report the problem and treat the document as
                # empty, using the same container type as the success path.
                print(e)
                return set()

        classifier = NaiveBayesClassifier(tokenizer=tokenize)
        # The model file is presumably produced beforehand with
        # classifier.fit(u'naive_train_data') followed by
        # classifier.dump('naive_classifier.dat').
        classifier.load('naive_classifier.dat')
        classifier.reduce(400)

        start = time()
        # Floats keep the divisions below true divisions on Python 2 as well.
        total = 0.0
        errors = 0.0
        for root, _dirs, files in os.walk(u'naive_test_data/', topdown=True):
            for name in files:
                if root.startswith('.') or name.startswith('.'):
                    continue
                category = root.split('/')[-1]
                with io.open(os.path.join(root, name), 'r',
                             encoding='utf-8') as doc_file:
                    text = doc_file.read()
                predict = classifier.predict(text)
                total += 1
                if category != predict:
                    errors += 1
                    print('predict: %s, actual: %s, errors percentage: %0.2f' % (
                        predict.encode('utf-8'), category.encode('utf-8'),
                        100 * errors / total))

        # Without this guard an empty test directory surfaces as an opaque
        # ZeroDivisionError in the summary line.
        self.assertTrue(total > 0,
                        'no test documents found under naive_test_data/')
        print('testing completed, total: %d, errors: %d, error rate:%0.2f, costs: %0.2f' % (
            total, errors, 100 * errors / total, time() - start))
        return errors / total
def assignment_e_naivebayes_1():
    """Train a language identifier and check it on four unseen sentences.

    One InMemoryCorpus per language code is loaded from
    ``data/<language>.txt``, a NaiveBayesClassifier is trained over the
    "body" field, and for each sample sentence the first reported match must
    carry the expected language code.
    """
    # These are really language-specific functions, so using them for a
    # language identifier is a huge simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    def match_collector(match: dict):
        """Store a reported match and echo it; scores are log-probabilities."""
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # The training set for the language identifier.
    print("LOADING...")
    training_set = {}
    for code in ["en", "no", "da", "de"]:
        training_set[code] = InMemoryCorpus(f"data/{code}.txt")

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    samples = [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.", "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de"),
    ]
    for buffer, language in samples:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        top_match = results[0]
        assert top_match["category"] == language
def test_china_example_from_textbook(self):
    """Check category order and probabilities for the textbook China example.

    Trains on a three-document "china" corpus and a one-document "not china"
    corpus, classifies one buffer, and verifies that exactly two matches are
    reported. Scores are exponentiated before being compared to four decimal
    places.
    """
    import math
    from corpus import InMemoryDocument, InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier

    china = InMemoryCorpus()
    china_bodies = (
        "Chinese Beijing Chinese",
        "Chinese Chinese Shanghai",
        "Chinese Macao",
    )
    for doc_id, body in enumerate(china_bodies):
        china.add_document(InMemoryDocument(doc_id, {"body": body}))

    not_china = InMemoryCorpus()
    not_china.add_document(
        InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))

    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"],
                                      self._normalizer, self._tokenizer)

    matches = []
    classifier.classify("Chinese Chinese Chinese Tokyo Japan", matches.append)

    # (category, probability) in the order the matches must be reported.
    expected = (("china", 0.0003), ("not china", 0.0001))
    self.assertEqual(len(matches), 2)
    for match, (category, probability) in zip(matches, expected):
        self.assertEqual(match["category"], category)
        self.assertAlmostEqual(math.exp(match["score"]), probability, 4)
from maxentclassifier import MaximumEntropyClassifier
from naivebayesclassifier import NaiveBayesClassifier
import random
import csv

fname = 'training.csv'

# Train both classifiers once, at import time, on the same training file.
nb = NaiveBayesClassifier(fname, grams=[1, 2])
nb.setThresholds(neg=1.0, pos=20.0)
nb.setWeight(0.000000000005)
nb.trainClassifier()

ment = MaximumEntropyClassifier(fname)
ment.trainClassifier()

classifiers = [nb, ment]


def csvdata_to_list(data):
    """Return the rows yielded by *data* as a new list, in order."""
    return list(data)


# NOTE(review): this excerpt ends inside search(). As far as it is visible,
# `i` is never advanced and nothing is returned; confirm against the rest of
# the function before relying on it.
def search(text,data):
    # Collects the first column of every row whose lower-cased first column
    # contains `text`.
    output = []
    i=0
    for d in data:
        if d[0].lower().find(text) != -1:
            output.append([])
            output[i].append(d[0])
processed = re.sub(r'—', r"-", line) processed = re.sub(r'([^\w\s\'])', r' \1 ', line) processed = processed.lower() return (processed.split()) #End def parser = argparse.ArgumentParser() parser.add_argument('train', help='The filename that points to training set.') parser.add_argument('test', help='The filename that points to test set.') args = parser.parse_args() # Train our classifier nbc = NaiveBayesClassifier(featurizer, classer, (AGREE_CLASS, DISAGREE_CLASS)) with open(args.train, 'r', encoding='UTF-8') as csv_train: train_reader = csv.reader(csv_train, delimiter=',') next(train_reader) for row in train_reader: rating = float(row[1]) if rating >= -1 and rating < 1: continue nbc.add_sample(row) #End with nbc.smooth() false_counts = Counter() true_counts = Counter() real_counts = Counter()
def assignment_e():
    """Exercise the naive Bayes classifier on two small scenarios.

    First, a language identifier is trained on one corpus per language code
    (loaded from ``data/<language>.txt``) and four sample sentences must
    each yield the expected language as the first reported match. Second,
    Example 13.1 from the textbook is replicated and the two reported
    categories and their probabilities are checked. Reported scores are
    log-probabilities.
    """
    # These are really language-specific functions, so using them for a
    # language identifier is a huge simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    def match_collector(match: dict):
        """Store a reported match and echo its score and category."""
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    def classify_fresh(model, text):
        """Echo *text*, reset the collected matches, and classify it."""
        print(text)
        results.clear()
        model.classify(text, match_collector)

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {}
    for code in ["en", "no", "da", "de"]:
        training_set[code] = InMemoryCorpus(f"data/{code}.txt")

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    samples = [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.", "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de"),
    ]
    for buffer, language in samples:
        classify_fresh(classifier, buffer)
        assert results[0]["category"] == language

    # For demonstration purposes, replicate Example 13.1 on pages 241 and 242
    # in the textbook.
    china = InMemoryCorpus()
    china_bodies = [
        "Chinese Beijing Chinese",
        "Chinese Chinese Shanghai",
        "Chinese Macao",
    ]
    for doc_id, body in enumerate(china_bodies):
        china.add_document(InMemoryDocument(doc_id, {"body": body}))

    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))

    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    buffer = "Chinese Chinese Chinese Tokyo Japan"
    classify_fresh(classifier, buffer)

    assert len(results) == 2
    best, other = results
    assert best["category"] == "china"
    assert other["category"] == "not china"
    assert math.isclose(math.exp(best["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(other["score"]), 0.0001, abs_tol=0.00005)