def run(self):
    # Without a dev set, just evaluate each n-gram configuration;
    # otherwise grid-search feature weights and thresholds as well.
    if not self.usedev:
        for grams in self.allgrams:
            c = NaiveBayesClassifier(self.rawfname, grams=grams)
            c.trainClassifier()
            self.stdout = True
            self.evaluate(c)
        return
    for grams in self.allgrams:
        c = NaiveBayesClassifier(self.rawfname, grams=grams)
        c.trainClassifier()
        for w in self.allweights:
            c.setWeight(w)
            for t1 in self.allthresholds:
                for t2 in self.allthresholds:
                    c.setThresholds(neg=t1, pos=t2)
                    cinfo, accpos, accneg, accall, corrall = self.evaluate(c)
                    self.results.append([cinfo, accpos, accneg, accall, corrall])
    if self.csvout:
        self.flushToCSV()
def assignment_e_naivebayes_2():
    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
    china.add_document(InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
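# The expected scores asserted above follow from multinomial naive Bayes with
# add-one (Laplace) smoothing, as in the textbook example. A minimal sketch of
# that arithmetic, independent of the NaiveBayesClassifier implementation (the
# function name below is ours):
import math

def textbook_example_scores():
    # Priors: 3 of the 4 training documents belong to "china".
    prior_china, prior_not = 3 / 4, 1 / 4
    # Smoothed term likelihoods: the "china" corpus has 8 tokens, the
    # "not china" corpus has 3 tokens, and the joint vocabulary size is 6.
    p_chinese_china = (5 + 1) / (8 + 6)  # 3/7
    p_tokyo_china = (0 + 1) / (8 + 6)    # 1/14
    p_japan_china = (0 + 1) / (8 + 6)    # 1/14
    p_chinese_not = (1 + 1) / (3 + 6)    # 2/9
    p_tokyo_not = (1 + 1) / (3 + 6)      # 2/9
    p_japan_not = (1 + 1) / (3 + 6)      # 2/9
    # Score "Chinese Chinese Chinese Tokyo Japan" under each category.
    score_china = prior_china * p_chinese_china ** 3 * p_tokyo_china * p_japan_china
    score_not = prior_not * p_chinese_not ** 3 * p_tokyo_not * p_japan_not
    return score_china, score_not  # roughly (0.0003, 0.0001)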
def classify_tweets(request):
    consumer_key = "Wb4W1n264iHhcrqcXt54bA"
    consumer_secret = "2NFs7pO610XKQUOs5hPAz8wCEO4uxmP3111HPhsmgc"
    access_token = "36641014-28RR3YAp6MxFxJ706gsp5a7bRy0sYDsjLCwixs2iM"
    access_token_secret = "qOGQg84VvurJKX9qSF3Zgl973BxF6ryt7Yruoxtw"
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    query = request.POST.get('query')
    result = api.search(query)
    tweets = []
    classification = []
    for tweet in result:
        try:
            tweets.append(str(tweet.text))
        except:
            pass

    posScore = 0
    negScore = 0
    for tweet in tweets:
        tokens = tweet.split()
        # Preprocess tokens (these helpers appear to mutate the list in place).
        data_preprocess.remove_noise_words(tokens)
        data_preprocess.remove_names(tokens)
        data_preprocess.remove_links(tokens)

        # Per-category tweet and token totals, read from the database.
        tweet_counts = []
        token_counts = []
        category_counts = defaultdict(lambda: defaultdict(int))
        p = tweet_category_count.objects.get(id=1)
        tweet_counts.append(p.positive_count)
        tweet_counts.append(p.negative_count)
        p = token_category_count.objects.get(id=1)
        token_counts.append(p.positive_count)
        token_counts.append(p.negative_count)

        # Per-token counts in each category; tokens unseen in training get zero.
        for token in tokens:
            try:
                p = pos_tokens.objects.get(ptoken=token)
                category_counts[token]['pos'] = p.pcount
            except:
                category_counts[token]['pos'] = 0
        for token in tokens:
            try:
                p = neg_tokens.objects.get(ntoken=token)
                category_counts[token]['neg'] = p.ncount
            except:
                category_counts[token]['neg'] = 0

        classifier = NaiveBayesClassifier()
        result = classifier.classify(tokens, category_counts, tweet_counts, token_counts)
        if result == 'pos':
            posScore += 1
        else:
            negScore += 1
        classification.append(result)
    return render_to_response("index.html",
                              {'tweets': tweets, 'pos_neg': classification,
                               'posScore': posScore, 'negScore': negScore})
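# A sketch of how counts shaped like the ones above typically combine in a
# multinomial naive Bayes classify(); the project's actual NaiveBayesClassifier
# may differ (e.g. in its smoothing or choice of vocabulary size):
import math

def classify_sketch(tokens, category_counts, tweet_counts, token_counts, vocab_size=1):
    # Log-priors from the per-category tweet counts.
    total_tweets = tweet_counts[0] + tweet_counts[1]
    log_pos = math.log(float(tweet_counts[0]) / total_tweets)
    log_neg = math.log(float(tweet_counts[1]) / total_tweets)
    # Add-one smoothed log-likelihoods from the per-token counts.
    for token in tokens:
        log_pos += math.log((category_counts[token]['pos'] + 1.0) /
                            (token_counts[0] + vocab_size))
        log_neg += math.log((category_counts[token]['neg'] + 1.0) /
                            (token_counts[1] + vocab_size))
    return 'pos' if log_pos >= log_neg else 'neg'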
def assignment_e_naivebayes_1():
    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.", "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language
def test_china_example_from_textbook(self):
    import math
    from corpus import InMemoryDocument, InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
    china.add_document(InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"],
                                      self._normalizer, self._tokenizer)
    results = []
    classifier.classify("Chinese Chinese Chinese Tokyo Japan",
                        lambda m: results.append(m))
    self.assertEqual(len(results), 2)
    self.assertEqual(results[0]["category"], "china")
    self.assertAlmostEqual(math.exp(results[0]["score"]), 0.0003, 4)
    self.assertEqual(results[1]["category"], "not china")
    self.assertAlmostEqual(math.exp(results[1]["score"]), 0.0001, 4)
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier
    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]
    training_set = {language: InMemoryCorpus(os.path.join(data_path, f"{language}.txt"))
                    for language in languages}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    print("Returned scores are log-probabilities.")

    def evaluator(text):
        results = []
        classifier.classify(text, lambda m: results.append(m))
        return results

    simple_repl("text", evaluator)
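# simple_repl is imported from elsewhere in this project and is not shown here.
# A minimal sketch of a read-eval-print helper with the same shape (the exact
# implementation is an assumption):
def simple_repl(prompt, evaluator):
    # Read lines until an empty input, passing each through the evaluator and
    # printing whatever it returns.
    while True:
        text = input(f"{prompt}> ")
        if not text:
            break
        print(evaluator(text))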
def test_language_detection_trained_on_some_news_corpora(self):
    import os.path
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier
    training_set = {
        language: InMemoryCorpus(os.path.join(data_path, f"{language}.txt"))
        for language in ["en", "no", "da", "de"]
    }
    classifier = NaiveBayesClassifier(training_set, ["body"],
                                      self._normalizer, self._tokenizer)
    self._classify_buffer_and_verify_top_categories(
        "Vil det riktige språket identifiseres? Dette er bokmål.", classifier, ["no"])
    self._classify_buffer_and_verify_top_categories(
        "I don't believe that the number of tokens exceeds a billion.", classifier, ["en"])
    self._classify_buffer_and_verify_top_categories(
        "De danske drenge drikker snaps!", classifier, ["da"])
    self._classify_buffer_and_verify_top_categories(
        "Der Kriminalpolizei! Haben sie angst?", classifier, ["de"])
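# The helper referenced above is not shown in this excerpt. A minimal sketch of
# what such a test-class method presumably does (the real helper may differ):
def _classify_buffer_and_verify_top_categories(self, buffer, classifier, categories):
    # Collect all matches, then check that the highest-scoring categories
    # are exactly the expected ones, in order.
    results = []
    classifier.classify(buffer, lambda m: results.append(m))
    results.sort(key=lambda m: m["score"], reverse=True)
    self.assertListEqual([m["category"] for m in results[:len(categories)]],
                         categories)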
class NaiveBayesClassifierTest(unittest.TestCase):

    def test_predict(self):
        STOP_WORDS = set(line.strip().decode('utf-8')
                         for line in open("stopwords.dic", 'r'))

        def tokenize(text):
            try:
                seg_list = jieba.cut(text, cut_all=False)
                return set([x.strip() for x in seg_list if x not in STOP_WORDS])
            except Exception, e:
                print e
                return []

        classifier = NaiveBayesClassifier(tokenizer=tokenize)
        # classifier.fit(u'naive_train_data')
        # classifier.dump('naive_classifier.dat')
        classifier.load('naive_classifier.dat')
        classifier.reduce(400)  # keep only the 400 most informative features
        start = time()
        total = 0.0
        errors = 0.0
        for root, dirs, files in os.walk(u'naive_test_data/', topdown=True):
            for name in files:
                if root.startswith('.') or name.startswith('.'):
                    continue
                category = root.split('/')[-1]  # directory name is the true label
                text = open(os.path.join(root, name), 'r').read().decode('utf-8')
                predict = classifier.predict(text)
                total += 1
                if category != predict:
                    errors += 1
                    print 'predict: %s, actual: %s, errors percentage: %0.2f' % (
                        predict.encode('utf-8'), category.encode('utf-8'),
                        100 * errors / total)
        print 'testing completed, total: %d, errors: %d, error rate: %0.2f, costs: %0.2f' % (
            total, errors, 100 * errors / total, time() - start)
        return errors / total
def run(self):
    for grams in self.allgrams:
        c = NaiveBayesClassifier(self.rawfname, grams=grams)
        c.trainClassifier()
        self.stdout = False
        return self.evaluate(c)
    # NOTE: the early return above short-circuits after the first n-gram
    # configuration, so the grid search below is unreachable as written.
    for grams in self.allgrams:
        c = NaiveBayesClassifier(self.rawfname, grams=grams)
        c.trainClassifier()
        for w in self.allweights:
            c.setWeight(w)
            for t1 in self.allthresholds:
                for t2 in self.allthresholds:
                    c.setThresholds(neg=t1, pos=t2)
                    cinfo, accpos, accneg, accall, corrall = self.evaluate(c)
                    self.results.append([cinfo, accpos, accneg, accall, corrall])
from maxentclassifier import MaximumEntropyClassifier
from naivebayesclassifier import NaiveBayesClassifier
import random
import csv

fname = 'training.csv'

nb = NaiveBayesClassifier(fname, grams=[1, 2])
nb.setThresholds(neg=1.0, pos=20.0)
nb.setWeight(0.000000000005)
nb.trainClassifier()

ment = MaximumEntropyClassifier(fname)
ment.trainClassifier()

classifiers = [nb, ment]

def csvdata_to_list(data):
    d = []
    for row in data:
        d.append(row)
    return d

def search(text, data):
    output = []
    i = 0
    for d in data:
        if d[0].lower().find(text) != -1:
            output.append([])
            output[i].append(d[0])
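# The search() above is cut off in this excerpt. A self-contained sketch of the
# presumable intent (hypothetical; the original may also collect further
# columns or classify each hit):
def search_sketch(text, data):
    # Return one row per record whose first column contains `text`.
    output = []
    for d in data:
        if d[0].lower().find(text) != -1:
            output.append([d[0]])
    return output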
'''
import tornado.ioloop
import tornado.web
import urllib
import tweepy
import os
from maxentclassifier import MaximumEntropyClassifier
from naivebayesclassifier import NaiveBayesClassifier

# name of training set file
fname = 'trainingandtestdata/training.csv'

# train classifiers here first
nb = NaiveBayesClassifier(fname, grams=[1, 2])
nb.setThresholds(neg=1.0, pos=20.0)
nb.setWeight(0.000000000005)
nb.trainClassifier()

ment = MaximumEntropyClassifier(fname)
ment.trainClassifier()

classifiers = [nb, ment]

class MainHandler(tornado.web.RequestHandler):
    '''
    Handles request to main page
    '''
    def get(self):
        query = self.get_argument("query", "").strip()
        cchosen = int(self.get_argument("classifier-type", 0))
    processed = re.sub(r'—', r"-", line)
    processed = re.sub(r'([^\w\s\'])', r' \1 ', processed)
    processed = processed.lower()
    return processed.split()
#End def

parser = argparse.ArgumentParser()
parser.add_argument('train', help='The filename that points to the training set.')
parser.add_argument('test', help='The filename that points to the test set.')
args = parser.parse_args()

# Train our classifier
nbc = NaiveBayesClassifier(featurizer, classer, (AGREE_CLASS, DISAGREE_CLASS))
with open(args.train, 'r', encoding='UTF-8') as csv_train:
    train_reader = csv.reader(csv_train, delimiter=',')
    next(train_reader)  # skip the header row
    for row in train_reader:
        rating = float(row[1])
        if -1 <= rating < 1:
            continue  # drop near-neutral samples
        nbc.add_sample(row)
#End with
nbc.smooth()

false_counts = Counter()
true_counts = Counter()
real_counts = Counter()
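# featurizer, classer, AGREE_CLASS, and DISAGREE_CLASS come from earlier in the
# file and are not shown. Hypothetical versions consistent with how rows are
# used above, where each row looks like (text, rating, ...):
AGREE_CLASS, DISAGREE_CLASS = 'agree', 'disagree'

def featurizer(row):
    # Split the text column into token features.
    return row[0].lower().split()

def classer(row):
    # Ratings at or above +1 count as agreement; at or below -1, disagreement.
    # (Near-neutral rows are filtered out before training, per the loop above.)
    return AGREE_CLASS if float(row[1]) >= 1 else DISAGREE_CLASS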
def assignment_e():
    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.", "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language

    # For demonstration purposes, replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
    china.add_document(InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
from pymongo import MongoClient
from bson.objectid import ObjectId
import jieba
import re
from bs4 import BeautifulSoup
from naivebayesclassifier import NaiveBayesClassifier
from weighter.informationgain import InformationGain

if __name__ == '__main__':
    STOP_WORDS = set(line.strip().decode('utf-8')
                     for line in open("stopwords.dic", 'r'))

    def tokenize(text):
        try:
            # print text.encode('utf-8')
            seg_list = jieba.cut(text, cut_all=False)
            zh_vocabulary = re.compile(ur"([\u4E00-\u9FA5]+$)")  # CJK-only tokens
            return [x.strip() for x in seg_list
                    if zh_vocabulary.match(x) and x not in STOP_WORDS]
        except Exception, e:
            print e
            return []

    client = MongoClient()
    documents = client.rss.documents
    classifier = NaiveBayesClassifier(tokenizer=tokenize)
    classifier.load(u'raw_features_1.dat')
    classifier.reduce(max_size=404, weighter=InformationGain)
    for x in documents.find({}, {'_id': '1', 'content': 1}):
        content = BeautifulSoup(x['content']).text.encode('utf-8')
        category = classifier.predict_text(content)
        print category.encode('utf-8')
        documents.update({'_id': x['_id']}, {'$set': {'category': category}})
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from naivebayesclassifier import NaiveBayesClassifier

l = NaiveBayesClassifier('data')

while True:
    name = raw_input()
    name_unicode = name.decode('utf-8')
    # Classify on the final character of the input name.
    final_p = l.classify(name_unicode[-1], force_class_average=True)
    best_p = 0
    best_ans = -1
    for i in final_p:
        if final_p[i] > best_p:
            best_p = final_p[i]
            best_ans = i
    # `status` maps class labels to display strings; it is assumed to be
    # defined elsewhere in the original file.
    print status[best_ans], best_p
    if name == 'exit':
        break