def main():
    """Run the sample classifiers and print their probabilities and verdicts.

    Four classifiers are created in turn (plain, naive Bayes, and two Fisher
    classifiers); each one is pointed at 'test1.db' through ``setdb``.
    """
    # Plain classifier: feature probability and weighted probability.
    base = docclass.classifier(docclass.getwords)
    base.setdb('test1.db')
    docclass.sampletrain(base)
    print(base.fprob('quick', 'good'))
    print(base.weighted_prob('money', 'good', base.fprob))
    # Run the sample training a second time and print the same figure again.
    docclass.sampletrain(base)
    print(base.weighted_prob('money', 'good', base.fprob))

    # Naive Bayes classifier.
    bayes = docclass.naivebayes(docclass.getwords)
    bayes.setdb('test1.db')
    docclass.sampletrain(bayes)
    for category in ('good', 'bad'):
        print(bayes.prob('quick rabbit', category))
    for text in ('quick rabbit', 'quick money'):
        print(bayes.classify(text, default='unknown'))
    bayes.setthreshold('bad', 3.0)
    print(bayes.classify('quick money', default='unknown'))

    # Fisher classifier.
    fisher = docclass.fisherclassifier(docclass.getwords)
    fisher.setdb('test1.db')
    docclass.sampletrain(fisher)
    print(fisher.cprob('quick', 'good'))
    print(fisher.cprob('money', 'bad'))
    print(fisher.weighted_prob('money', 'bad', fisher.cprob))
    for category in ('good', 'bad'):
        print(fisher.fisherprob('quick rabbit', category))
    for text in ('quick rabbit', 'quick money'):
        print(fisher.classify(text))

    # Second Fisher classifier, handed to feedclassifier with a sample feed.
    feed_fisher = docclass.fisherclassifier(docclass.getwords)
    feed_fisher.setdb('test1.db')
    feedclassifier('feed_sample2.rss', feed_fisher)
    for word in ('Pandas', 'python'):
        print(feed_fisher.cprob(word, 'python'))
def guess_the_prof(self):
    """Guess the department of the professor selected in ``self.box``.

    A naive Bayes classifier is built when ``self.method`` is ``'naive'``,
    a Fisher classifier otherwise. The classifier is handed to
    ``self.trainer`` together with the selected professor, the limits listed
    in ``self.lb`` are applied, and the classification result is stored in
    ``self.pdep`` before ``self.verdict()`` is called.

    Each entry of ``self.lb`` is parsed as ``'<number>-<label>'``.

    Raises:
        ValueError: if the part before the first '-' is not a number.
        IndexError: if an entry contains no '-'.
    """
    use_naive = self.method == 'naive'
    if use_naive:
        cl = docclass.naivebayes(docclass.getwords)
    else:
        cl = docclass.fisherclassifier(docclass.getwords)
    # The two classifiers expose their per-category limit under different names.
    apply_limit = cl.setthreshold if use_naive else cl.setminimum

    prof_sel = self.box.get()  # the professor whose department we want to guess
    doc_of_prof = self.profs_data[prof_sel]
    self.trainer(prof_sel, cl)

    # Parse every entry before applying any, so a malformed entry leaves the
    # classifier untouched.
    thresholds = []
    for item in self.lb.get(0, END):
        # Split on the first '-' only, so a label that itself contains a
        # hyphen is kept whole instead of being cut at its first hyphen.
        merged = item.split('-', 1)
        threshnum = float(merged[0])
        thresholds.append((merged[1], threshnum))
    for label, number in thresholds:
        apply_limit(label, number)

    self.pdep = cl.classify(doc_of_prof, default='unknown')
    self.verdict()
def train2():
    """Run the sample training 2000 times against 'test2.db', then commit."""
    import docclass

    bayes = docclass.naivebayes(docclass.getwords)
    bayes.setdb('test2.db')
    for _ in range(2000):
        docclass.sampletrain(bayes)
    # Commit once, through the connection object the classifier exposes.
    bayes.con.commit()
def fetch(self):
    """Load the course data from the entered URL, train the classifier and fill the listboxes.

    Sets ``self.dataobj``, ``self.courses`` (sorted course keys),
    ``self.obj_list``, ``self.classifier_obj`` and ``self.course_list``.
    The indicator widget ``self.color`` is yellow while the work is running
    and green when it has finished.
    """
    # Fetching phase.
    self.color.config(background='yellow')
    self.course_list = []
    self.update()

    url = self.entry.get()
    self.dataobj = Data()
    self.dataobj.init_data(url)
    courselist = self.dataobj.courselist
    # sorted() yields a sorted list on Python 2 and 3 alike; keys() followed
    # by .sort() only works where keys() returns a list.
    self.courses = sorted(courselist.keys())
    self.obj_list = [courselist[key] for key in self.courses]

    # Training phase: the lower-cased split name is the document, the first
    # code is the category.
    self.classifier_obj = docclass.naivebayes(docclass.getwords)
    for course in self.obj_list:
        self.classifier_obj.train(course.split_name.lower(), course.first_code)

    # Raw string, so the regex escapes are not read as string escapes.
    name_pattern = re.compile(r"(.*?)\s*\(")
    for key in self.courses:  # adding courses to the listbox
        course_name = courselist[key].name
        match = name_pattern.match(course_name)
        if match is not None:
            # NOTE(review): no space before '(' here, unlike the fallback
            # branch below -- confirm whether that is intended.
            entry_text = key + '' + '(' + match.group(1) + ')'
        else:
            entry_text = key + ' ' + '(' + course_name + ')'
        self.coursesListbox.insert(END, entry_text)

    for key in self.courses:  # adding course categories to the other listbox
        code = courselist[key].first_code
        if code not in self.course_list:
            self.course_list.append(code)
        if code not in self.programsListbox.get(0, END):
            self.programsListbox.insert(END, code)

    self.color.config(background='green')
    self.update()
    self.coursesListbox.bind('<<ListboxSelect>>', self.estimate)
    self.programsListbox.bind('<<ListboxSelect>>', self.analyze)
def train_classifier(self):
    """Return a naive Bayes classifier trained on every member's publications.

    Each publication is trained as the document, with the key of
    ``self.faculty_members`` it was found under as the category.
    """
    trained = docclass.naivebayes(docclass.getwords)
    for professor in self.faculty_members:
        record = self.faculty_members[professor]
        for title in record.publications:
            trained.train(title, professor)
    return trained
def test_nb_prob():
    """Print the naive Bayes probability of 'quick rabbit' for 'good' and 'bad'."""
    sys.stderr.write("testing computation of naive bayes probability...\n")
    reload(docclass)
    bayes = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(bayes)
    # Values noted by the original author: 0.15624999.. for 'good',
    # 0.05000000... for 'bad'.
    for category in ('good', 'bad'):
        sys.stdout.write("%f\n" % (bayes.prob('quick rabbit', category)))
def testClassify(self):
    """Check classification of the sample phrases, with and without a threshold."""
    cl = docclass.naivebayes(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual('good', cl.classify('quick rabbit', default='unknown'))
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))

    # With a threshold of 3.0 on 'bad' the default label is expected.
    cl.setthreshold('bad', 3.0)
    self.assertEqual('unknown', cl.classify('quick money', default='unknown'))

    # After ten more rounds of sample training 'bad' is expected again.
    for _ in range(10):
        docclass.sampletrain(cl)
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
def run_spamche_button(self):
    """Classify the text in ``self.lbox`` and show 'spam' or 'ham' in ``self.var``.

    When the input is empty a prompt is shown instead and no classifier is
    built.
    """
    user_input = self.lbox.get()
    if user_input == '':
        # Nothing to classify, so skip building and training a classifier.
        self.var.set('Please load the spam training test!')
        return

    c1 = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(c1)
    c1.setthreshold('bad', 3.0)
    # Only 'bad' is reported as spam; the default label and every other
    # category are reported as ham.
    ans = c1.classify(user_input, default='unknown')
    self.var.set('spam' if ans == 'bad' else 'ham')
def test_nb_classify():
    """Print naive Bayes classifications of the sample phrases.

    The trailing comments give the outputs noted by the original author.
    """
    sys.stderr.write("testing naive bayes classification...\n")
    reload(docclass)
    bayes = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(bayes)

    def show(text):
        """Write the classification of ``text`` on its own line."""
        sys.stdout.write("%s\n" % (bayes.classify(text, default='unknown')))

    show('quick rabbit')  # 'good'
    show('quick money')  # 'bad'

    # test threshold
    bayes.setthreshold('bad', 3.0)
    show('quick money')  # 'unknown'

    for _ in range(10):
        docclass.sampletrain(bayes)
    show('quick money')  # 'bad'
def predict(event):
    """Train a classifier on the filled-in rows, predict the others, then call printOut.

    ``self`` is taken from the enclosing scope. Rows of ``self.trainer`` are
    used only when both of their first two fields are non-empty; rows of
    ``self.guess`` get their second field overwritten with the prediction
    when their first field is non-empty.
    """
    bayes = docclass.naivebayes(docclass.getwords)

    # training classifier
    for key in self.trainer:
        row = self.trainer[key]
        if row[0] == "" or row[1] == "":
            continue
        bayes.train(row[0], row[1])

    # predicting grades for other courses
    for key in self.guess:
        if self.guess[key][0] == "":
            continue
        self.guess[key][1] = bayes.classify(self.guess[key][0], default="unknown")

    # printing predicted results to text widget
    self.printOut()
def fetch(self):
    """Fetch members and projects, train the classifier and list the project names.

    Sets ``self.members``, ``self.cl`` and ``self.projects``; the project
    keys are inserted into ``self.project_listbox`` in sorted order.
    """
    fetcher = self.fetcher
    fetcher.fetch_members()
    self.members = fetcher.fetch_member_publications()

    # Every publication is trained with its member's key as the category.
    self.cl = docclass.naivebayes(docclass.getwords)
    for member_key in self.members:
        for title in self.members[member_key].publication:
            self.cl.train(title, member_key)

    self.projects = fetcher.fetch_projects()
    for project_name in sorted(self.projects.keys()):
        self.project_listbox.insert(END, project_name)
def predict(event):
    """Train a classifier on the filled-in rows, predict the others, then call printOut.

    ``self`` is taken from the enclosing scope. Rows of ``self.trainer`` are
    used only when both of their first two fields are non-empty; rows of
    ``self.guess`` get their second field overwritten with the prediction
    when their first field is non-empty.
    """
    bayes = docclass.naivebayes(docclass.getwords)

    # training classifier
    for key in self.trainer:
        row = self.trainer[key]
        if row[0] == "" or row[1] == "":
            continue
        bayes.train(row[0], row[1])

    # predicting grades for other courses
    for key in self.guess:
        if self.guess[key][0] == "":
            continue
        self.guess[key][1] = bayes.classify(self.guess[key][0], default="unknown")

    # printing predicted results to text widget
    self.printOut()
def Bayes_prediction(self):
    """Predict the department of the professor chosen in ``self.combovar``.

    A naive Bayes classifier is trained on every other professor listed in
    ``self.dictionary_of_department_and_professor``, the thresholds from
    ``self.list_of_thresholds`` are applied, and the prediction (``None``
    is passed as the default) goes to ``self.help_to_write``.
    """
    bayes = docclass.naivebayes(docclass.getwords)
    for department in self.list_of_department:
        for teacher in self.dictionary_of_department_and_professor[department]:
            # The selected professor is left out of the training data.
            if teacher != self.combovar.get():
                bayes.train(self.dictionary_as_database[teacher], department)

    for department, score in self.list_of_thresholds:
        bayes.setthreshold(department, score)

    selected_info = self.dictionary_as_database[self.combovar.get()]
    self.help_to_write(bayes.classify(selected_info, default=None))
def bayesianClassification():
    """Train a naive Bayes classifier on two articles and classify a third.

    The data comes from ``readpickle()``. Rows 0 and 1 of the word matrix
    are trained as 'iraq' and 'india'; the title of article 2 is printed,
    followed by the category predicted for row 2.
    """
    print('## Bayesian Classification')
    allw, artw, artt, wordmatrix, wordvec = readpickle()

    def wordmatrixfeatures(x):
        """Return the ``wordvec`` entries whose count in row ``x`` is positive."""
        return [wordvec[idx] for idx, count in enumerate(x) if count > 0]

    # Kept from the original: the result is discarded.
    wordmatrixfeatures(wordmatrix[0])

    #sys.path.append("../chapter6-filtering")
    import docclass
    classifier = docclass.naivebayes(wordmatrixfeatures)
    classifier.setdb('newstest.db')

    print(artt[0].encode("utf-8"))
    # Train this as an 'iraq' story
    classifier.train(wordmatrix[0], 'iraq')

    print(artt[1].encode("utf-8"))
    # Train this as an 'india' story
    classifier.train(wordmatrix[1], 'india')

    print(artt[2].encode("utf-8"))
    # How is this story classified? Classify the row whose title was just
    # printed (row 2); the original classified row 1, which had just been
    # used for training.
    print(classifier.classify(wordmatrix[2]))
def crawl(self, pages, depth=2, maxpages=1000):
    """Crawl level by level from ``pages``, indexing each page that can be fetched.

    Args:
        pages: iterable of URLs to start from.
        depth: number of link levels to process.
        maxpages: the crawl stops once more than this many pages have been
            started.

    Each page is handed to ``self.addtoindex`` together with a naive Bayes
    classifier trained on the sample data; each link is recorded through
    ``self.addlinkref``. ``self.dbcommit()`` is called after every level and
    before an early stop.
    """
    import docclass
    classifier = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(classifier)

    visited = 0  # not named ``iter``, which would shadow the builtin
    for _ in range(depth):
        newpages = set()
        for page in pages:
            print(visited)
            visited += 1
            if visited > maxpages:
                # Commit what has been indexed at this level before stopping.
                self.dbcommit()
                return

            try:
                c = urllib2.urlopen(page)
            except Exception:
                # Narrower than a bare except, so Ctrl-C still stops the crawl.
                print("Could not open %s" % page)
                continue
            try:
                content = c.read()
            finally:
                c.close()

            # chardet reports None when it cannot guess an encoding.
            charset = chardet.detect(content[:400])['encoding'] or 'utf-8'
            content = content.decode(charset, "ignore").encode('UTF-8')
            soup = BeautifulSoup(content, 'html.parser')
            self.addtoindex(page, soup, classifier)

            for link in soup('a'):
                if 'href' not in dict(link.attrs):
                    continue
                url = urljoin(page, link['href'])
                if "'" in url:
                    continue
                url = url.split('#')[0]  # strip the fragment part
                if url[0:4] == 'http' and not self.isindexed(url):
                    newpages.add(url)
                linkText = self.gettextonly(link)
                self.addlinkref(page, url, linkText)

        self.dbcommit()
        pages = newpages
def train_naivebayes(train_path, test_path):
    """Train a naive Bayes classifier on one set of files and score it on another.

    Both arguments are glob patterns. A file's label is the fourth
    dot-separated piece of its path. The four spam/ham outcome counts are
    passed to ``score`` as (TP, FN, FP, TN); any other (prediction, label)
    pair is printed.
    """
    def labelled(pattern):
        """Yield (contents, label) for every file matching ``pattern``."""
        for path in glob.glob(pattern):
            with open(path, 'r') as handle:
                text = handle.read()
            yield text, path.split('.')[3]

    cl = docclass.naivebayes(docclass.getwords)
    for text, label in labelled(train_path):
        cl.train(text, label)
    print("Train Done!")

    # Keyed by (actual label, predicted label).
    outcomes = {
        ('spam', 'spam'): 0.0,  # TP
        ('spam', 'ham'): 0.0,   # FN
        ('ham', 'spam'): 0.0,   # FP
        ('ham', 'ham'): 0.0,    # TN
    }
    cl.setthreshold('ham', 2.4)
    for text, label in labelled(test_path):
        predict = cl.classify(text, default='Unknown')
        if (label, predict) in outcomes:
            outcomes[(label, predict)] += 1
        else:
            print("%s %s" % (predict, label))
    print("Test Done!")

    score(outcomes[('spam', 'spam')], outcomes[('spam', 'ham')],
          outcomes[('ham', 'spam')], outcomes[('ham', 'ham')])
def testProb(self):
    """Check the naive Bayes probabilities computed after the sample training."""
    cl = docclass.naivebayes(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
    # which was removed from unittest in Python 3.12.
    # NOTE(review): prob() is called here as (category, text); confirm this
    # matches the parameter order of docclass.naivebayes.prob.
    self.assertAlmostEqual(0.15624999, cl.prob('good', 'quick rabbit'))
    self.assertAlmostEqual(0.05, cl.prob('bad', 'quick rabbit'))
cl = docclass.classifier(docclass.get_words) docclass.sampletrain(cl) print cl.cat_count('good') print cl.feat_cat_count('money','bad') print cl.feat_cat_count('money','good') print cl.fprob('money','good') print cl.weightedprob('money','good',cl.fprob) docclass.sampletrain(cl) print cl.weightedprob('money','good',cl.fprob) """ cl = docclass.naivebayes(docclass.get_words) docclass.sampletrain(cl) print cl.cat_count('good') print cl.feat_cat_count('money','bad') print cl.feat_cat_count('money','good') print cl.fprob('money','good') print 'test' print cl.weightedprob('money','good',cl.fprob) print cl.weightedprob('money','good',cl.fprob) print cl.prob('quick rabbit','good') print cl.prob('QUICK RABBIT','good') print cl.classify('quick rabbit',default = 'unkown')
print "original: " + str(emotion) for emotion_group in emotion_groups.keys(): if emotion_groups[emotion_group].has_key(emotion): grouped_emotion = emotion_group print "updated: " + grouped_emotion print "get rid of removed!!!" if grouped_emotion != 'remove': emotion_list.append([text, emotion, grouped_emotion]) return emotion_list if __name__ == "__main__": #run3() fs = fetchBasicFeatureSet() processed_fs = processed_fs(fs) classifier = nbc.naivebayes(nbc.getwords) classifier.setdb(); for item in processed_fs: text = item[0] emotion = item[1] grouped_emotion = item[2] classifier.train(text, grouped_emotion) #print classifier.classify("L'amour s'en va comme cette eau courante L'amour s'en va")
import docclass as dc

# Build a naive Bayes classifier, passing dc.getwords as its only argument.
classifier = dc.naivebayes(dc.getwords)
# Left disabled by the original author: database setup and a one-off
# training call.
#classifier.setdb() #works!
#classifier.train("hello world","test_category")
import docclass as dc import os training_path_spam = "training/spam/" training_path_not_spam = "training/not_spam/" testing_path_spam = "testing/spam/" testing_path_not_spam = "testing/not_spam/" train_set = dc.naivebayes(dc.getwords) try: os.remove('mack.db') # deletes old database except: pass # don't freak out if there's not already a database to delete train_set.setdb('mack.db') #creates fresh database def train(filepath, desig): #desig is 'spam' or 'not spam' file_list = os.listdir(filepath) for each in file_list: test_string = "" for line in open(filepath + each, 'r'): if line != '\n': train_set.train(line, desig) def test(filepath): file_list = os.listdir(filepath) for each in file_list: test_string = "" for line in open(filepath + each, 'r'): if line != '\n':
def setUpClass(cls):
    """Create the shared classifier and store the result of ``setdb(TEST_DATABASE)``."""
    client = docclass.naivebayes(docclass.getwords)
    cls.client = client
    cls.db = client.setdb(TEST_DATABASE)
def train_classifier(self):
    """Create ``self.classifier`` and train it on every member's publications.

    Each publication is trained as the document, with the key of
    ``self.faculty_members`` it belongs to as the category.
    """
    bayes = docclass.naivebayes(docclass.getwords)
    self.classifier = bayes
    for member, record in self.faculty_members.items():
        for title in record.publications:
            bayes.train(title, member)
def kk(): for i in range(4): print(i) if i+5>6: return i return i+1 a=kk() print(a)''' ## 课后习题2 c2=docclass.naivebayes(docclass.getwords) #docclass.sampletrain(c2) c1=docclass.noignore(docclass.getwords) #docclass.sampletrain(c2) print('hhh') #print(c2.prob('quick','good')) #print(c1.prob('quick rabbit','bad')) docclass.sampletrain(c1) print(c1.prob('quick','good')) #print(c1.prob('quick rabbit','bad')) ''' #print(c1.classify('owns water',default='unknown')) #print(c1.classify('quick money',default='unknown'))
def __init__(self, classifier_dict, feed_queue):
    """Set up the thread, its database connection, classifier and feed queue.

    Args:
        classifier_dict: passed to ``nbc.naivebayes`` as its second argument.
        feed_queue: stored as ``self.feed_queue``.
    """
    # The threading docs ask subclasses to run the base constructor before
    # doing anything else to the thread.
    threading.Thread.__init__(self)
    self.con = psycopg2.connect("dbname='nlptweets' host='localhost' user='******' password='******'")
    self.cur = self.con.cursor()
    self.classifier = nbc.naivebayes(nbc.getwords, classifier_dict)
    self.feed_queue = feed_queue
def naive_bayes(self):
    """Create ``self.my_train`` and train it on the courses that have a description.

    An entry of ``self.courses_grades`` is used only when it has exactly
    three items and the third is a ``unicode`` string. The third item is
    trained as the document and the first as its category; that category's
    threshold is then set to 1.0.
    """
    self.my_train = docclass.naivebayes(docclass.getwords)
    for key in self.courses_grades:
        entry = self.courses_grades[key]
        # isinstance is the idiomatic type check and also accepts subclasses.
        if len(entry) != 3 or not isinstance(entry[2], unicode):
            continue
        grade, description = entry[0], entry[2]
        # course description as doc and course grade as class
        self.my_train.train(description, grade)
        # as mentioned in the assignment, the threshold is set to 1
        self.my_train.setthreshold(grade, 1.0)
import docclass
from subprocess import check_output

cl = docclass.naivebayes(docclass.getwords)
#remove previous db file
cl.setdb('hrishi.db')
docclass.eTrain(cl)

# Classify test files 11 to 20 of each kind: every spam file first, then
# every non-spam file.
for kind in ('spam', 'nonspam'):
    for i in range(11, 21):
        filename = 'test/' + kind + str(i) + '.txt'
        with open(filename, 'r', encoding='utf-8') as testFile:
            print(filename, cl.classify(testFile.read()))

#classify text: "the banking dinner" as spam or not spam
#print( cl.classify('the banking dinner') )