Example #1
0
def train2():
    """Run docclass.sampletrain 2000 times on a naive Bayes classifier.

    The classifier is built with ``docclass.getwords``, pointed at
    'test2.db' through ``setdb``, and ``con.commit()`` is called once
    after the final training pass.
    """
    import docclass

    nb_classifier = docclass.naivebayes(docclass.getwords)
    nb_classifier.setdb('test2.db')
    for _ in range(2000):
        docclass.sampletrain(nb_classifier)
    nb_classifier.con.commit()
Example #2
0
def train1():
    """Run docclass.sampletrain 2000 times on a basic classifier.

    The classifier is built with ``docclass.getwords``, pointed at
    'test1.db' through ``setdb``, and ``con.commit()`` is called once
    after the final training pass.
    """
    import docclass

    base_classifier = docclass.classifier(docclass.getwords)
    base_classifier.setdb('test1.db')
    for _ in range(2000):
        docclass.sampletrain(base_classifier)
    base_classifier.con.commit()
Example #3
0
def train3():
    """Run docclass.sampletrain 2000 times on a Fisher classifier.

    The classifier is built with ``docclass.getwords``, pointed at
    'test3.db' through ``setdb``, and ``con.commit()`` is called once
    after the final training pass.
    """
    import docclass

    fisher_classifier = docclass.fisherclassifier(docclass.getwords)
    fisher_classifier.setdb('test3.db')
    for _ in range(2000):
        docclass.sampletrain(fisher_classifier)
    fisher_classifier.con.commit()
def test_fisher_weightedprob():
    """Print weightedprob('money', 'bad') computed with the Fisher cprob.

    docclass is reloaded first, then a new Fisher classifier is
    sample-trained once. A progress banner goes to stderr and the
    value to stdout.
    """
    sys.stderr.write("testing computation of fisher weightedprob...\n")
    reload(docclass)
    fisher = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(fisher)
    weighted = fisher.weightedprob('money', 'bad', fisher.cprob)
    sys.stdout.write("%f\n" % weighted)  # 0.75
Example #5
0
 def testProb(self):
   """Check Fisher cprob and fisherprob values after one sample-training pass.

   The classifier is built with ``docclass.getwords`` and ``setdb('test.db')``
   is called before training.
   """
   cl = docclass.fisherclassifier(docclass.getwords)
   cl.setdb('test.db')
   docclass.sampletrain(cl)
   # assertAlmostEqual is the supported spelling; the "...Equals" alias is
   # deprecated in unittest.
   self.assertAlmostEqual(0.57142857, cl.cprob('quick', 'good'))
   self.assertAlmostEqual(0.78013987, cl.fisherprob('quick rabbit', 'good'))
   self.assertAlmostEqual(0.35633596, cl.fisherprob('quick rabbit', 'bad'))
def test_fprob():
    """Print fprob('quick', 'good') from a sample-trained basic classifier.

    docclass is reloaded before the classifier is created. A progress
    banner goes to stderr and the value to stdout.
    """
    sys.stderr.write("testing sampletrain and computation of feature probability...\n")
    reload(docclass)
    basic = docclass.classifier(docclass.getwords)
    docclass.sampletrain(basic)
    sys.stdout.write("%f\n" % basic.fprob('quick', 'good'))  # 0.6666...
def test_nb_prob():
    """Print the naive Bayes prob of 'quick rabbit' for 'good', then 'bad'.

    docclass is reloaded and a new classifier is sample-trained once
    before the two values are written to stdout.
    """
    sys.stderr.write("testing computation of naive bayes probability...\n")
    reload(docclass)
    bayes = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(bayes)
    # Expected output: 0.15624999.. for 'good', then 0.05000000... for 'bad'
    for category in ('good', 'bad'):
        sys.stdout.write("%f\n" % bayes.prob('quick rabbit', category))
def test_fisher_cprob():
    """Print Fisher cprob for ('quick', 'good') and then ('money', 'bad').

    docclass is reloaded and a new Fisher classifier is sample-trained
    once before the two values are written to stdout.
    """
    sys.stderr.write("testing computation of fisher cprob...\n")
    reload(docclass)
    fisher = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(fisher)
    # Expected output: 0.57142857... and then 1.0
    for word, category in (('quick', 'good'), ('money', 'bad')):
        sys.stdout.write("%f\n" % fisher.cprob(word, category))
def test_weightedprob():
    """Print weightedprob('money', 'good') after one and after two training passes.

    docclass is reloaded and a basic classifier created; each pass calls
    sampletrain and then writes the weighted probability (based on fprob)
    to stdout.
    """
    sys.stderr.write("testing computation of weightedprob (probability for unseen words)...\n")
    reload(docclass)
    basic = docclass.classifier(docclass.getwords)
    # Expected output: 0.25 after the first pass, 0.1666... after the second
    for _ in range(2):
        docclass.sampletrain(basic)
        weighted = basic.weightedprob('money', 'good', basic.fprob)
        sys.stdout.write("%f\n" % weighted)
Example #10
0
    def upload_csv_button(self):
        """Ask the user for a file, read it, and sample-train a basic classifier.

        The selected path is stored on ``self.file_path``. If the dialog is
        dismissed without a selection, the method returns without reading
        anything or building a classifier.
        """
        self.file_path = tkFileDialog.askopenfilename(
            initialdir='/',
            title='Select file',
            filetypes=(('csv files', '*.csv'), ('all files', '*.*')))

        # A cancelled dialog yields an empty value; calling open() on it
        # would raise instead of simply doing nothing.
        if not self.file_path:
            return

        with open(self.file_path) as f:
            s = f.read()
        my_d = repr(s)
        # NOTE(review): the result of getwords() is discarded and sampletrain()
        # is not given the file contents -- confirm whether the uploaded data
        # was meant to be used for training.
        docclass.getwords(my_d)
        c1 = docclass.classifier(docclass.getwords)
        docclass.sampletrain(c1)
Example #11
0
  def testClassify(self):
    """Check naive Bayes classify() results before and after setting a 'bad' threshold.

    The classifier is built with ``docclass.getwords`` and ``setdb('test.db')``
    is called before the first sample-training pass.
    """
    cl = docclass.naivebayes(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    # assertEqual is the supported spelling; the assertEquals alias is
    # deprecated in unittest.
    self.assertEqual('good', cl.classify('quick rabbit', default='unknown'))
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))

    # With the 'bad' threshold set to 3.0 the default is expected instead.
    cl.setthreshold('bad', 3.0)
    self.assertEqual('unknown', cl.classify('quick money', default='unknown'))

    # Ten further sample-training passes are expected to bring 'bad' back.
    for _ in range(10):
      docclass.sampletrain(cl)
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
def test_fisher_fisherprob():
    """Print one cprob value and two fisherprob values from a Fisher classifier.

    docclass is reloaded and a new classifier is sample-trained once.
    Output order on stdout: cprob('quick', 'good'), then fisherprob of
    'quick rabbit' for 'good' and for 'bad'.
    """
    sys.stderr.write("testing computation of fisher fisherprob...\n")
    reload(docclass)
    fisher = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(fisher)

    # cprob -- expected 0.57142857...
    sys.stdout.write("%f\n" % fisher.cprob('quick', 'good'))

    # fisher prob -- expected 0.780139 for 'good', then 0.356335 for 'bad'
    for category in ('good', 'bad'):
        sys.stdout.write("%f\n" % fisher.fisherprob('quick rabbit', category))
Example #13
0
  def testClassify(self):
    """Check Fisher classify() results under different 'bad' minimums.

    The classifier is built with ``docclass.getwords`` and ``setdb('test.db')``
    is called before the single sample-training pass.
    """
    cl = docclass.fisherclassifier(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)

    # assertEqual is the supported spelling; the assertEquals alias is
    # deprecated in unittest.
    self.assertEqual('good', cl.classify('quick rabbit', default='unknown'))
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))

    # A 'bad' minimum of 0.8 is expected to flip 'quick money' to 'good'.
    cl.setminimum('bad', 0.8)
    self.assertEqual('good', cl.classify('quick money', default='unknown'))

    # Lowering the 'bad' minimum to 0.4 is expected to restore 'bad'.
    cl.setminimum('bad', 0.4)
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
Example #14
0
    def run_spamche_button(self):
        """Classify the text from ``self.lbox`` and show the verdict in ``self.var``.

        A naive Bayes classifier is sample-trained and given a 'bad'
        threshold of 3.0 on every call. An empty input sets a prompt
        message; otherwise ``self.var`` is set to 'spam' when the
        classifier answers 'bad' and to 'ham' for any other answer.
        """
        spam_filter = docclass.naivebayes(docclass.getwords)
        docclass.sampletrain(spam_filter)
        spam_filter.setthreshold('bad', 3.0)

        text = self.lbox.get()
        if text == '':
            self.var.set('Please load the spam training test!')
            return

        verdict = spam_filter.classify(text, default='unkown')
        self.var.set('spam' if verdict == 'bad' else 'ham')
Example #15
0
def main():
    """Exercise the basic, naive Bayes and Fisher classifiers and print the results.

    Every classifier is built with ``docclass.getwords`` and has
    ``setdb('test1.db')`` called on it. The last section feeds
    'feed_sample2.rss' to a Fisher classifier through ``feedclassifier``.

    Each print uses the single-argument parenthesised form, which prints the
    same text on Python 2 and is also valid syntax on Python 3.
    """
    # Basic classifier: feature probability and weighted probability,
    # the latter printed after one and after two sample-training passes.
    cl = docclass.classifier(docclass.getwords)
    cl.setdb('test1.db')
    docclass.sampletrain(cl)
    print(cl.fprob('quick', 'good'))
    print(cl.weighted_prob('money', 'good', cl.fprob))
    docclass.sampletrain(cl)
    print(cl.weighted_prob('money', 'good', cl.fprob))

    # Naive Bayes: probabilities, then classification before and after
    # setting the 'bad' threshold to 3.0.
    clnb = docclass.naivebayes(docclass.getwords)
    clnb.setdb('test1.db')
    docclass.sampletrain(clnb)
    print(clnb.prob('quick rabbit', 'good'))
    print(clnb.prob('quick rabbit', 'bad'))
    print(clnb.classify('quick rabbit', default='unknown'))
    print(clnb.classify('quick money', default='unknown'))
    clnb.setthreshold('bad', 3.0)
    print(clnb.classify('quick money', default='unknown'))

    # Fisher classifier: cprob, weighted probability, fisherprob, classify.
    clfs = docclass.fisherclassifier(docclass.getwords)
    clfs.setdb('test1.db')
    docclass.sampletrain(clfs)
    print(clfs.cprob('quick', 'good'))
    print(clfs.cprob('money', 'bad'))
    print(clfs.weighted_prob('money', 'bad', clfs.cprob))
    print(clfs.fisherprob('quick rabbit', 'good'))
    print(clfs.fisherprob('quick rabbit', 'bad'))
    print(clfs.classify('quick rabbit'))
    print(clfs.classify('quick money'))

    # Second Fisher classifier, fed from an RSS sample file.
    clfs2 = docclass.fisherclassifier(docclass.getwords)
    clfs2.setdb('test1.db')
    feedclassifier('feed_sample2.rss', clfs2)
    print(clfs2.cprob('Pandas', 'python'))
    print(clfs2.cprob('python', 'python'))
def test_nb_classify():
    """Print naive Bayes classify() results around a 'bad' threshold change.

    docclass is reloaded and a new classifier is sample-trained once.
    Four results go to stdout: 'quick rabbit', 'quick money', 'quick money'
    after setthreshold('bad', 3.0), and 'quick money' again after ten more
    training passes.
    """
    sys.stderr.write("testing naive bayes classification...\n")
    reload(docclass)
    bayes = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(bayes)

    # Expected output: 'good' and then 'bad'
    for phrase in ('quick rabbit', 'quick money'):
        sys.stdout.write("%s\n" % bayes.classify(phrase, default='unknown'))

    # test threshold -- expected output: 'unknown'
    bayes.setthreshold('bad', 3.0)
    sys.stdout.write("%s\n" % bayes.classify('quick money', default='unknown'))

    # Ten more training passes -- expected output: 'bad'
    for _ in range(10):
        docclass.sampletrain(bayes)
    sys.stdout.write("%s\n" % bayes.classify('quick money', default='unknown'))
def test_fisher_classify():
    """Print Fisher classify() results before and after setting minimums.

    docclass is reloaded and a new Fisher classifier is sample-trained once.
    Four results go to stdout: 'quick rabbit', 'quick money', then
    'quick money' after setminimum('bad', 0.8) and again after
    setminimum('good', 0.4).
    """
    sys.stderr.write("testing fisher classification...\n")
    reload(docclass)
    fisher = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(fisher)

    # classify -- expected output: 'good' and then 'bad'
    for phrase in ('quick rabbit', 'quick money'):
        sys.stdout.write("%s\n" % fisher.classify(phrase))

    # Set each minimum in turn and re-classify -- expected output: 'good' both times
    for category, minimum in (('bad', 0.8), ('good', 0.4)):
        fisher.setminimum(category, minimum)
        sys.stdout.write("%s\n" % fisher.classify('quick money'))
    def crawl(self, pages, depth=2, maxpages=1000):
        """Crawl outward from ``pages`` level by level, indexing every fetched page.

        A naive Bayes classifier is built and sample-trained once, then passed
        to ``self.addtoindex`` together with each parsed page. Links found on a
        page are recorded with ``self.addlinkref``; unindexed http(s) links
        form the page set for the next level.

        Args:
            pages: iterable of start URLs.
            depth: number of link levels to process.
            maxpages: cap on fetch attempts; the method returns as soon as the
                attempt counter exceeds it.

        Returns:
            None.
        """
        import docclass
        classifier = docclass.naivebayes(docclass.getwords)
        docclass.sampletrain(classifier)
        attempts = 0  # formerly named ``iter``, which shadowed the builtin
        for _ in range(depth):
            newpages = set()
            for page in pages:
                print(attempts)
                attempts += 1
                if attempts > maxpages:
                    return
                try:
                    response = urllib2.urlopen(page)
                except Exception:
                    # Best-effort crawl: skip pages that cannot be opened, but
                    # let KeyboardInterrupt / SystemExit propagate.
                    print("Could not open %s" % page)
                    continue
                try:
                    content = response.read()
                finally:
                    # Release the connection even if the read fails.
                    response.close()
                # NOTE(review): chardet can report no encoding (None) for input
                # it cannot identify; fall back to UTF-8 rather than passing
                # None to decode().
                charset = chardet.detect(content[:400])['encoding'] or 'utf-8'
                content = content.decode(charset, "ignore").encode('UTF-8')

                soup = BeautifulSoup(content, 'html.parser')
                self.addtoindex(page, soup, classifier)

                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]  # drop the fragment part
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
                self.dbcommit()
            pages = newpages
#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:
#
# Author:      Administrator
#
# Created:     30/10/2012
# Copyright:   (c) Administrator 2012
# Licence:     <your licence>
#-------------------------------------------------------------------------------

import docclass


# The block below is a bare string literal, not a docstring: it holds an
# earlier walkthrough of the basic classifier (category counts, fprob,
# weightedprob) and is never executed.
"""
cl = docclass.classifier(docclass.get_words)

docclass.sampletrain(cl)

print cl.cat_count('good')
print cl.feat_cat_count('money','bad')
print cl.feat_cat_count('money','good')
print cl.fprob('money','good')
print cl.weightedprob('money','good',cl.fprob)

docclass.sampletrain(cl)

print cl.weightedprob('money','good',cl.fprob)
"""

# Build a naive Bayes classifier, passing docclass.get_words as its
# constructor argument.
cl = docclass.naivebayes(docclass.get_words)
Example #20
0
import docclass as d

# Fisher classifier demo: classify two phrases, then see how setting the
# 'bad' and 'good' minimums and further training change the answer.
# print(...) with a single argument behaves the same on Python 2 and is
# also valid syntax on Python 3.
cl = d.fisherclassifier(d.getwords)
d.sampletrain(cl)
print(cl.classify('quick rabbit'))
print(cl.classify('quick money'))
cl.setminimum('bad', 0.8)
print(cl.classify('quick money'))
cl.setminimum('good', 0.4)
print(cl.classify('quick money'))

# Ten more sample-training passes before the final classification.
for i in range(10):
    d.sampletrain(cl)
print(cl.classify('quick money'))
Example #21
0
import docclass

### When getwords is first called to count the words, how does each word later get tagged with the category it belongs to?

'''###贝叶斯之前 146页为止
c1= docclass.classifier(docclass.getwords)
#c1.train('the quick brown fox jumps over the quick lazy dog','good')
#c1.train('make quick money in the online casino','bad')
#print(c1.fcount('quick','good'))
#print(c1.fcount('quick','bad'))

docclass.sampletrain(c1)
#print(c1.fprob('quick','good'))
print(c1.weightedprob('money','good',c1.fprob))

docclass.sampletrain(c1)
print(c1.weightedprob('money','good',c1.fprob))


##ddd 贝叶斯  到150页结束
c1=docclass.naivebayes(docclass.getwords)
#docclass.sampletrain(c1)
#print(c1.prob('quick rabbit','good'))
#print(c1.prob('quick rabbit','bad'))

#print(c1.classify('owns water',default='unknown'))
#print(c1.classify('quick money',default='unknown'))

c1.setthreshold('bad',5.0)#这个后面的数字是设定出现多少次会被认为是符合‘bad’条件,而不是后续的‘unknown’
c1.setthreshold('good',4.0)
Example #22
0
 def testProb(self):
   """Check naive Bayes prob() values after one sample-training pass.

   The classifier is built with ``docclass.getwords`` and ``setdb('test.db')``
   is called before training.
   """
   cl = docclass.naivebayes(docclass.getwords)
   cl.setdb('test.db')
   docclass.sampletrain(cl)
   # assertAlmostEqual is the supported spelling; the "...Equals" alias is
   # deprecated in unittest.
   # NOTE(review): prob() is called here as (category, text); confirm this
   # matches the parameter order of the docclass.naivebayes.prob under test.
   self.assertAlmostEqual(0.15624999, cl.prob('good', 'quick rabbit'))
   self.assertAlmostEqual(0.05, cl.prob('bad', 'quick rabbit'))
Example #23
0
# print cl.prob('quick money', 'bad')
# for i in range(10):
#     docclass.sampletrain(cl)
# print cl.classify('quick money', defalt='unknow')
# print cl.prob('quick money', 'good')
# print cl.prob('quick money', 'bad')

# cl=fisherClassifier.FisherClassifier(docclass.getwords)
# docclass.sampletrain(cl)
# print cl.cprob('quick', 'good')
# print cl.cprob('money', 'bad')

# cl=fisherClassifier.FisherClassifier(docclass.getwords)
# docclass.sampletrain(cl)
# print cl.cprob('quick', 'good')
# print cl.fisherprob('quick rabbit', 'good')
# print cl.fisherprob('quick rabbit', 'bad')

# cl=fisherClassifier.FisherClassifier(docclass.getwords)
# docclass.sampletrain(cl)
# print cl.classify('quick rabbit')
# print cl.classify("quick money")
# cl.setminimum('bad', 0.8)
# print cl.classify("quick money")

# Sample-train a Fisher classifier after calling setdb("test1.db"), then
# classify with a separate naive Bayes classifier that has the same
# setdb() call but is not trained here.
cl = fisherClassifier.FisherClassifier(docclass.getwords)
cl.setdb("test1.db")
docclass.sampletrain(cl)
cl2 = naivebayes.naivebayes(docclass.getwords)
cl2.setdb("test1.db")
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(cl2.classify('quick money'))
Example #24
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import docclass

# Naive Bayes demo: classify two phrases after a single sample-training pass.
cl = docclass.naivebayes(docclass.getwords)
docclass.sampletrain(cl)
for phrase in ('quick rabbit', 'quick money'):
    print(cl.classify(phrase, default='unknown'))

# Set the 'bad' threshold to 3.0 and classify 'quick money' again.
cl.setthreshold('bad', 3.0)
print(cl.classify('quick money', default='unknown'))

# Run ten more sample-training passes, then classify once more.
for i in range(10):
    docclass.sampletrain(cl)
print(cl.classify('quick money', default='unknown'))
 def setUp(self):
     """Pass ``self.client`` to ``docclass.sampletrain``."""
     client = self.client
     docclass.sampletrain(client)
 def setUp(self):
     """Pass ``self.client`` to ``docclass.sampletrain``."""
     client = self.client
     docclass.sampletrain(client)