def train2():
    """Run docclass.sampletrain 2000 times on a naivebayes classifier stored in 'test2.db'."""
    import docclass

    bayes = docclass.naivebayes(docclass.getwords)
    bayes.setdb('test2.db')
    for _ in range(2000):
        docclass.sampletrain(bayes)
    # NOTE(review): the flattened source does not show whether this commit sat
    # inside the loop; a single commit after the loop is assumed here.
    bayes.con.commit()
def train1():
    """Run docclass.sampletrain 2000 times on a basic classifier stored in 'test1.db'."""
    import docclass

    basic = docclass.classifier(docclass.getwords)
    basic.setdb('test1.db')
    for _ in range(2000):
        docclass.sampletrain(basic)
    # NOTE(review): the flattened source does not show whether this commit sat
    # inside the loop; a single commit after the loop is assumed here.
    basic.con.commit()
def train3():
    """Run docclass.sampletrain 2000 times on a fisherclassifier stored in 'test3.db'."""
    import docclass

    fisher = docclass.fisherclassifier(docclass.getwords)
    fisher.setdb('test3.db')
    for _ in range(2000):
        docclass.sampletrain(fisher)
    # NOTE(review): the flattened source does not show whether this commit sat
    # inside the loop; a single commit after the loop is assumed here.
    fisher.con.commit()
def test_fisher_weightedprob():
    """Print weightedprob('money', 'bad') computed over cprob for a sample-trained fisherclassifier.

    The original inline note gives the expected output as 0.75.
    """
    sys.stderr.write("testing computation of fisher weightedprob...\n")
    reload(docclass)
    fisher = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(fisher)
    weighted = fisher.weightedprob('money', 'bad', fisher.cprob)
    sys.stdout.write("%f\n" % weighted)
def testProb(self):
    """Check cprob and fisherprob of a sample-trained fisherclassifier backed by 'test.db'.

    Uses assertAlmostEqual (the non-deprecated spelling of assertAlmostEquals),
    which compares values rounded to 7 decimal places by default.
    """
    cl = docclass.fisherclassifier(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    self.assertAlmostEqual(0.57142857, cl.cprob('quick', 'good'))
    self.assertAlmostEqual(0.78013987, cl.fisherprob('quick rabbit', 'good'))
    self.assertAlmostEqual(0.35633596, cl.fisherprob('quick rabbit', 'bad'))
def test_fprob():
    """Print fprob('quick', 'good') for a sample-trained basic classifier.

    The original inline note gives the expected output as 0.6666...
    """
    sys.stderr.write("testing sampletrain and computation of feature probability...\n")
    reload(docclass)
    model = docclass.classifier(docclass.getwords)
    docclass.sampletrain(model)
    sys.stdout.write("%f\n" % model.fprob('quick', 'good'))
def test_nb_prob():
    """Print prob('quick rabbit', <category>) for 'good' then 'bad' on a naivebayes classifier.

    The original inline notes give the expected outputs as 0.15624999.. and
    0.05000000... respectively.
    """
    sys.stderr.write("testing computation of naive bayes probability...\n")
    reload(docclass)
    bayes = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(bayes)
    for category in ('good', 'bad'):
        sys.stdout.write("%f\n" % bayes.prob('quick rabbit', category))
def test_fisher_cprob():
    """Print cprob for ('quick', 'good') and ('money', 'bad') on a sample-trained fisherclassifier.

    The original inline notes give the expected outputs as 0.57142857... and 1.0.
    """
    sys.stderr.write("testing computation of fisher cprob...\n")
    reload(docclass)
    fisher = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(fisher)
    for feature, category in (('quick', 'good'), ('money', 'bad')):
        sys.stdout.write("%f\n" % fisher.cprob(feature, category))
def test_weightedprob():
    """Print weightedprob('money', 'good') after one and after two sampletrain passes.

    The original inline notes give the expected outputs as 0.25 and 0.1666...
    """
    sys.stderr.write("testing computation of weightedprob (probability for unseen words)...\n")
    reload(docclass)
    model = docclass.classifier(docclass.getwords)
    # Each pass trains once more and then reports the weighted probability.
    for _ in range(2):
        docclass.sampletrain(model)
        weighted = model.weightedprob('money', 'good', model.fprob)
        sys.stdout.write("%f\n" % weighted)
def upload_csv_button(self):
    """Ask the user for a file, read it, and run the docclass word/training calls.

    The chosen path is stored on ``self.file_path``. If the dialog is
    cancelled (empty path), the method returns without opening anything
    instead of raising from ``open``.
    """
    self.file_path = tkFileDialog.askopenfilename(
        initialdir='/',
        title='Select file',
        filetypes=(('csv files', '*.csv'), ('all files', '*.*')))
    # A cancelled dialog yields an empty value; opening it would raise.
    if not self.file_path:
        return
    with open(self.file_path) as f:
        s = f.read()
    my_d = repr(s)
    # NOTE(review): the result of getwords and the trained classifier below are
    # both discarded in the original; kept as-is, confirm whether that is intended.
    docclass.getwords(my_d)
    c1 = docclass.classifier(docclass.getwords)
    docclass.sampletrain(c1)
def testClassify(self):
    """Check naivebayes classification before and after setting the 'bad' threshold to 3.0.

    Uses assertEqual, the non-deprecated spelling of assertEquals.
    """
    cl = docclass.naivebayes(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    self.assertEqual('good', cl.classify('quick rabbit', default='unknown'))
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
    cl.setthreshold('bad', 3.0)
    self.assertEqual('unknown', cl.classify('quick money', default='unknown'))
    # Ten further training passes, then re-check the classification once.
    # NOTE(review): the flattened source does not show whether the final
    # assertion sat inside the loop; it is assumed to follow it.
    for _ in range(10):
        docclass.sampletrain(cl)
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
def test_fisher_fisherprob():
    """Print cprob('quick', 'good') and then fisherprob('quick rabbit', ...) for 'good' and 'bad'.

    The original inline notes give the expected outputs as 0.57142857...,
    0.780139 and 0.356335.
    """
    sys.stderr.write("testing computation of fisher fisherprob...\n")
    reload(docclass)
    fisher = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(fisher)

    # Category probability of a single feature.
    sys.stdout.write("%f\n" % fisher.cprob('quick', 'good'))

    # Fisher probability of the whole phrase, per category.
    for category in ('good', 'bad'):
        sys.stdout.write("%f\n" % fisher.fisherprob('quick rabbit', category))
def testClassify(self):
    """Check fisherclassifier classification under different minimums for 'bad'.

    Uses assertEqual, the non-deprecated spelling of assertEquals.
    """
    cl = docclass.fisherclassifier(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    self.assertEqual('good', cl.classify('quick rabbit', default='unknown'))
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
    cl.setminimum('bad', 0.8)
    self.assertEqual('good', cl.classify('quick money', default='unknown'))
    cl.setminimum('bad', 0.4)
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
def run_spamche_button(self):
    """Classify the text from ``self.lbox`` and show 'spam' or 'ham' via ``self.var``.

    A naivebayes classifier is built and sample-trained on every call, with
    the 'bad' threshold set to 3.0. Empty input shows a prompt message instead.
    """
    spam_filter = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(spam_filter)
    spam_filter.setthreshold('bad', 3.0)

    text = self.lbox.get()
    if text == '':
        self.var.set('Please load the spam training test!')
        return

    # Anything other than 'bad' (including the default label) is shown as ham.
    verdict = spam_filter.classify(text, default='unkown')
    self.var.set('spam' if verdict == 'bad' else 'ham')
def main():
    """Print results from the basic, naive Bayes and Fisher classifiers, all using 'test1.db'."""
    # Basic classifier: feature probability and weighted probability.
    basic = docclass.classifier(docclass.getwords)
    basic.setdb('test1.db')
    docclass.sampletrain(basic)
    print(basic.fprob('quick', 'good'))
    print(basic.weighted_prob('money', 'good', basic.fprob))
    docclass.sampletrain(basic)
    print(basic.weighted_prob('money', 'good', basic.fprob))

    # Naive Bayes: probabilities, then classification with and without a threshold.
    bayes = docclass.naivebayes(docclass.getwords)
    bayes.setdb('test1.db')
    docclass.sampletrain(bayes)
    print(bayes.prob('quick rabbit', 'good'))
    print(bayes.prob('quick rabbit', 'bad'))
    print(bayes.classify('quick rabbit', default='unknown'))
    print(bayes.classify('quick money', default='unknown'))
    bayes.setthreshold('bad', 3.0)
    print(bayes.classify('quick money', default='unknown'))

    # Fisher classifier: cprob, weighted probability, fisherprob, classification.
    fisher = docclass.fisherclassifier(docclass.getwords)
    fisher.setdb('test1.db')
    docclass.sampletrain(fisher)
    print(fisher.cprob('quick', 'good'))
    print(fisher.cprob('money', 'bad'))
    print(fisher.weighted_prob('money', 'bad', fisher.cprob))
    print(fisher.fisherprob('quick rabbit', 'good'))
    print(fisher.fisherprob('quick rabbit', 'bad'))
    print(fisher.classify('quick rabbit'))
    print(fisher.classify('quick money'))

    # Second Fisher classifier, passed to feedclassifier with an RSS sample file.
    feed_fisher = docclass.fisherclassifier(docclass.getwords)
    feed_fisher.setdb('test1.db')
    feedclassifier('feed_sample2.rss', feed_fisher)
    print(feed_fisher.cprob('Pandas', 'python'))
    print(feed_fisher.cprob('python', 'python'))
def test_nb_classify():
    """Print naivebayes classifications before and after setting the 'bad' threshold.

    The original inline notes give the expected outputs, in order, as
    'good', 'bad', 'unknown' and 'bad'.
    """
    sys.stderr.write("testing naive bayes classification...\n")
    reload(docclass)
    bayes = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(bayes)

    for phrase in ('quick rabbit', 'quick money'):
        sys.stdout.write("%s\n" % bayes.classify(phrase, default='unknown'))

    # Set the 'bad' threshold and classify the same phrase again.
    bayes.setthreshold('bad', 3.0)
    sys.stdout.write("%s\n" % bayes.classify('quick money', default='unknown'))

    # NOTE(review): the flattened source does not show whether the final
    # classify/write sat inside the loop; it is assumed to follow it.
    for _ in range(10):
        docclass.sampletrain(bayes)
    sys.stdout.write("%s\n" % bayes.classify('quick money', default='unknown'))
def test_fisher_classify():
    """Print fisherclassifier classifications before and after setting category minimums.

    The original inline notes give the expected outputs, in order, as
    'good', 'bad', 'good' and 'good'.
    """
    sys.stderr.write("testing fisher classification...\n")
    reload(docclass)
    fisher = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(fisher)

    # Plain classification of the two sample phrases.
    for phrase in ('quick rabbit', 'quick money'):
        sys.stdout.write("%s\n" % fisher.classify(phrase))

    # Set a minimum for each category in turn and re-classify 'quick money'.
    for category, minimum in (('bad', 0.8), ('good', 0.4)):
        fisher.setminimum(category, minimum)
        sys.stdout.write("%s\n" % fisher.classify('quick money'))
def crawl(self, pages, depth=2, maxpages=1000):
    """Crawl outward from *pages*, indexing each fetched page with a trained classifier.

    Args:
        pages: iterable of start URLs.
        depth: number of link levels to follow from the start pages.
        maxpages: the crawl returns once more than this many pages have
            been attempted.

    Pages that cannot be opened are reported and skipped. Links found on a
    page are recorded via ``addlinkref``; unindexed http links are queued for
    the next level.
    """
    import docclass

    classifier = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(classifier)

    visited = 0  # was named `iter`, which shadowed the builtin
    for _ in range(depth):
        newpages = set()
        for page in pages:
            print(visited)
            visited += 1
            if visited > maxpages:
                return
            try:
                c = urllib2.urlopen(page)
            except Exception:
                # Best-effort fetch; unlike a bare `except:` this still lets
                # KeyboardInterrupt / SystemExit stop the crawl.
                print("Could not open %s" % page)
                continue
            content = c.read()
            # chardet reports encoding None when it cannot guess; fall back
            # to UTF-8 rather than passing None to decode().
            charset = chardet.detect(content[:400])['encoding'] or 'utf-8'
            content = content.decode(charset, "ignore").encode('UTF-8')
            soup = BeautifulSoup(content, 'html.parser')
            self.addtoindex(page, soup, classifier)

            for link in soup('a'):
                if 'href' not in dict(link.attrs):
                    continue
                url = urljoin(page, link['href'])
                if url.find("'") != -1:
                    continue
                url = url.split('#')[0]  # drop the fragment part
                if url[0:4] == 'http' and not self.isindexed(url):
                    newpages.add(url)
                linkText = self.gettextonly(link)
                self.addlinkref(page, url, linkText)

            # NOTE(review): nesting after the inline comment is not visible in
            # the flattened source; a commit per page is assumed.
            self.dbcommit()
        pages = newpages
#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:
#
# Author:      Administrator
#
# Created:     30/10/2012
# Copyright:   (c) Administrator 2012
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import docclass

# Disabled walkthrough of the basic classifier (previously held in a bare
# string literal, which had no runtime effect):
#
# cl = docclass.classifier(docclass.get_words)
# docclass.sampletrain(cl)
# print cl.cat_count('good')
# print cl.feat_cat_count('money','bad')
# print cl.feat_cat_count('money','good')
# print cl.fprob('money','good')
# print cl.weightedprob('money','good',cl.fprob)
# docclass.sampletrain(cl)
# print cl.weightedprob('money','good',cl.fprob)

# Active code: build a naivebayes classifier over docclass.get_words features.
cl = docclass.naivebayes(docclass.get_words)
import docclass as d

# Build and sample-train a fisherclassifier, then print a series of
# classifications while adjusting the per-category minimums.
cl = d.fisherclassifier(d.getwords)
d.sampletrain(cl)
print(cl.classify('quick rabbit'))
print(cl.classify('quick money'))

cl.setminimum('bad', 0.8)
print(cl.classify('quick money'))

cl.setminimum('good', 0.4)
print(cl.classify('quick money'))

# Ten further training passes, then classify once more.
# NOTE(review): the flattened source does not show whether the final print sat
# inside the loop; it is assumed to follow it.
for i in range(10):
    d.sampletrain(cl)
print(cl.classify('quick money'))
import docclass
### getwords is called first to count the words; how is each word later tagged with the category it falls into?
'''### Before naive Bayes, up to page 146
c1= docclass.classifier(docclass.getwords)
#c1.train('the quick brown fox jumps over the quick lazy dog','good')
#c1.train('make quick money in the online casino','bad')
#print(c1.fcount('quick','good'))
#print(c1.fcount('quick','bad'))
docclass.sampletrain(c1)
#print(c1.fprob('quick','good'))
print(c1.weightedprob('money','good',c1.fprob))
docclass.sampletrain(c1)
print(c1.weightedprob('money','good',c1.fprob))
##ddd Naive Bayes, ends at page 150
c1=docclass.naivebayes(docclass.getwords)
#docclass.sampletrain(c1)
#print(c1.prob('quick rabbit','good'))
#print(c1.prob('quick rabbit','bad'))
#print(c1.classify('owns water',default='unknown'))
#print(c1.classify('quick money',default='unknown'))
c1.setthreshold('bad',5.0)# the trailing number sets how many occurrences are needed before the text counts as 'bad' rather than falling through to 'unknown' (original author's reading; verify against docclass.setthreshold)
c1.setthreshold('good',4.0)
def testProb(self):
    """Check naivebayes prob values for 'good' and 'bad' against 'quick rabbit'.

    Uses assertAlmostEqual (the non-deprecated spelling of assertAlmostEquals),
    which compares values rounded to 7 decimal places by default.
    """
    cl = docclass.naivebayes(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    # NOTE(review): prob is called here as (category, text); confirm this
    # matches the parameter order of docclass.naivebayes.prob.
    self.assertAlmostEqual(0.15624999, cl.prob('good', 'quick rabbit'))
    self.assertAlmostEqual(0.05, cl.prob('bad', 'quick rabbit'))
# --- Disabled experiments, kept for reference ---
# print cl.prob('quick money', 'bad')
# for i in range(10):
#     docclass.sampletrain(cl)
# print cl.classify('quick money', defalt='unknow')
# print cl.prob('quick money', 'good')
# print cl.prob('quick money', 'bad')

# cl=fisherClassifier.FisherClassifier(docclass.getwords)
# docclass.sampletrain(cl)
# print cl.cprob('quick', 'good')
# print cl.cprob('money', 'bad')

# cl=fisherClassifier.FisherClassifier(docclass.getwords)
# docclass.sampletrain(cl)
# print cl.cprob('quick', 'good')
# print cl.fisherprob('quick rabbit', 'good')
# print cl.fisherprob('quick rabbit', 'bad')

# cl=fisherClassifier.FisherClassifier(docclass.getwords)
# docclass.sampletrain(cl)
# print cl.classify('quick rabbit')
# print cl.classify("quick money")
# cl.setminimum('bad', 0.8)
# print cl.classify("quick money")

# --- Active code ---
# Sample-train a FisherClassifier against "test1.db", then classify with a
# separate naivebayes instance pointed at the same database.
cl = fisherClassifier.FisherClassifier(docclass.getwords)
cl.setdb("test1.db")
docclass.sampletrain(cl)

cl2 = naivebayes.naivebayes(docclass.getwords)
cl2.setdb("test1.db")
print(cl2.classify('quick money'))
#!/usr/bin/env python # -*- coding: utf-8 -*- import docclass cl = docclass.naivebayes(docclass.getwords) docclass.sampletrain(cl) print(cl.classify('quick rabbit', default='unknown')) print(cl.classify('quick money', default='unknown')) cl.setthreshold('bad', 3.0) print(cl.classify('quick money', default='unknown')) for i in range(10): docclass.sampletrain(cl) print(cl.classify('quick money', default='unknown'))
def setUp(self):
    """Run docclass.sampletrain on ``self.client``."""
    client = self.client
    docclass.sampletrain(client)