Beispiel #1
0
def main():
    """Run the docclass demo for all three classifier flavours.

    In order this exercises ``docclass.classifier``, ``docclass.naivebayes``
    and ``docclass.fisherclassifier`` (each backed by ``test1.db`` and trained
    with ``docclass.sampletrain``), printing every intermediate result, and
    finally feeds ``feed_sample2.rss`` to a second Fisher classifier.

    Each ``print`` uses the parenthesised single-argument form so the
    function parses on Python 3 and prints the same text on Python 2.
    """
    # --- basic classifier -------------------------------------------------
    cl = docclass.classifier(docclass.getwords)
    cl.setdb('test1.db')
    docclass.sampletrain(cl)
    print(cl.fprob('quick', 'good'))
    print(cl.weighted_prob('money', 'good', cl.fprob))
    # Train on the sample set a second time and print the same weighted
    # probability again to show the effect of the extra training pass.
    docclass.sampletrain(cl)
    print(cl.weighted_prob('money', 'good', cl.fprob))

    # --- naive Bayes ------------------------------------------------------
    clnb = docclass.naivebayes(docclass.getwords)
    clnb.setdb('test1.db')
    docclass.sampletrain(clnb)
    print(clnb.prob('quick rabbit', 'good'))
    print(clnb.prob('quick rabbit', 'bad'))
    print(clnb.classify('quick rabbit', default='unknown'))
    print(clnb.classify('quick money', default='unknown'))
    # Classify 'quick money' again after setting the 'bad' threshold to 3.0.
    clnb.setthreshold('bad', 3.0)
    print(clnb.classify('quick money', default='unknown'))

    # --- Fisher classifier ------------------------------------------------
    clfs = docclass.fisherclassifier(docclass.getwords)
    clfs.setdb('test1.db')
    docclass.sampletrain(clfs)
    print(clfs.cprob('quick', 'good'))
    print(clfs.cprob('money', 'bad'))
    print(clfs.weighted_prob('money', 'bad', clfs.cprob))
    print(clfs.fisherprob('quick rabbit', 'good'))
    print(clfs.fisherprob('quick rabbit', 'bad'))
    print(clfs.classify('quick rabbit'))
    print(clfs.classify('quick money'))

    # --- Fisher classifier fed from an RSS sample -------------------------
    clfs2 = docclass.fisherclassifier(docclass.getwords)
    clfs2.setdb('test1.db')
    # NOTE(review): feedclassifier is not defined in this snippet; it is
    # expected to be provided by the surrounding module.
    feedclassifier('feed_sample2.rss', clfs2)
    print(clfs2.cprob('Pandas', 'python'))
    print(clfs2.cprob('python', 'python'))
Beispiel #2
0
def train1():
    """Run the sample training set 2000 times against ``test1.db`` and commit."""
    import docclass

    trainer = docclass.classifier(docclass.getwords)
    trainer.setdb('test1.db')
    # The loop counter itself is never used; only the repetition matters.
    for _ in range(2000):
        docclass.sampletrain(trainer)
    # Commit once on the classifier's connection after all passes.
    trainer.con.commit()
def test_fprob():
    """Train on the sample set and write fprob('quick', 'good') to stdout.

    A progress message goes to stderr; the probability is written to stdout
    formatted with ``%f``.
    """
    sys.stderr.write("testing sampletrain and computation of feature probability...\n")
    reload(docclass)
    trained = docclass.classifier(docclass.getwords)
    docclass.sampletrain(trained)
    # Original author's expected value: 0.6666...
    sys.stdout.write("%f\n" % trained.fprob('quick', 'good'))
def test_train():
    """Train on one 'good' and one 'bad' sentence and write both 'quick' counts.

    A progress message goes to stderr; the feature counts for 'quick' in
    'good' and then in 'bad' are written to stdout formatted with ``%f``.
    """
    sys.stderr.write("testing training...\n")
    trained = docclass.classifier(docclass.getwords)
    samples = (
        ('the quick brown fox jumps over the lazy dog', 'good'),
        ('make quick money in the online casino', 'bad'),
    )
    for text, label in samples:
        trained.train(text, label)
    # Original author's expected value for each category: 1.0
    for label in ('good', 'bad'):
        sys.stdout.write("%f\n" % trained.fcount('quick', label))
def test_weightedprob():
    """Write weightedprob('money', 'good') after one and after two training passes.

    A progress message goes to stderr; each probability is written to stdout
    formatted with ``%f``.
    """
    sys.stderr.write("testing computation of weightedprob (probability for unseen words)...\n")
    reload(docclass)
    trained = docclass.classifier(docclass.getwords)
    # Original author's expected values: 0.25 after the first pass,
    # 0.1666... after the second.
    for _ in range(2):
        docclass.sampletrain(trained)
        weighted = trained.weightedprob('money', 'good', trained.fprob)
        sys.stdout.write("%f\n" % weighted)
Beispiel #6
0
    def upload_csv_button(self):
        """Ask for a CSV file, run its text through ``docclass.getwords`` and
        train a throw-away classifier on the docclass sample set.

        The selected path is stored on ``self.file_path``.  If the dialog is
        cancelled the method returns right after storing the (empty) path
        instead of failing inside ``open``.
        """
        self.file_path = tkFileDialog.askopenfilename(
            initialdir='/',
            title='Select file',
            filetypes=(('csv files', '*.csv'), ('all files', '*.*')))

        # A cancelled dialog yields an empty value; opening it would raise.
        if not self.file_path:
            return

        with open(self.file_path) as f:
            s = f.read()
        # The tokenizer is fed the repr() of the file contents, not the raw text.
        my_d = repr(s)
        # NOTE(review): the return value of getwords is discarded here.
        docclass.getwords(my_d)
        # NOTE(review): c1 is trained on docclass's sample set, not on the
        # uploaded file, and is not stored anywhere after this method returns.
        c1 = docclass.classifier(docclass.getwords)
        docclass.sampletrain(c1)
Beispiel #7
0
def FisherClassifier(fn):
    """Train on the first 50 rated titles in *fn*, then print fprob for the rest.

    ``rated.txt`` is read as lines of space-separated fields: the first field
    is an entry id and the second its category.  Each line of *fn* is split
    on single spaces; field 0 is the entry id, field 1 is dropped and the
    remaining fields form the title text.  Only entries whose id appears in
    ``rated.txt`` are used: the first 50 are passed to ``cl.train`` with
    their category, and for every later one ``cl.fprob(title, 'A')`` is
    printed between two blank-line prints.

    The classifier is a ``docclass.classifier`` backed by ``A10.db``.

    Args:
        fn: Path of the file containing one entry per line.
    """
    cl = docclass.classifier(docclass.getwords)
    cl.setdb('A10.db')

    # Map entry id -> manually assigned category.
    stig = {}
    with open('rated.txt') as rated:
        for entry in rated:
            fields = entry.split(" ")
            if len(fields) < 2:
                # Blank or malformed line: there is no category to record.
                continue
            stig[fields[0]] = fields[1].strip('\n')

    trained = 0
    with open(fn) as entries:
        for line in entries:
            fields = line.split(" ")
            a = fields[0]
            if a not in stig:
                continue
            # Rebuild the title from scratch for every entry so that text
            # from previously processed titles never leaks into this one.
            # Slicing (rather than popping twice) also tolerates short lines.
            titlestr = ''.join(word + ' ' for word in fields[2:])
            if trained < 50:
                # Train the classifier using the first 50 rated entries.
                cl.train(titlestr, stig[a])
                trained += 1
            else:
                print("\n")
                print(cl.fprob(titlestr, 'A'))
                print("\n")
Beispiel #8
0
def FisherClassifier(fn):
    """Train on the first 50 rated titles in *fn*, then print fprob for the rest.

    ``rated.txt`` is read as lines of space-separated fields: the first field
    is an entry id and the second its category.  Each line of *fn* is split
    on single spaces; field 0 is the entry id, field 1 is dropped and the
    remaining fields form the title text.  Only entries whose id appears in
    ``rated.txt`` are used: the first 50 are passed to ``cl.train`` with
    their category, and for every later one ``cl.fprob(title, 'A')`` is
    printed between two blank-line prints.

    The classifier is a ``docclass.classifier`` backed by ``A10.db``.

    Args:
        fn: Path of the file containing one entry per line.
    """
    cl = docclass.classifier(docclass.getwords)
    cl.setdb('A10.db')

    # Map entry id -> manually assigned category.
    stig = {}
    with open('rated.txt') as rated:
        for entry in rated:
            fields = entry.split(" ")
            if len(fields) < 2:
                # Blank or malformed line: there is no category to record.
                continue
            stig[fields[0]] = fields[1].strip('\n')

    trained = 0
    with open(fn) as entries:
        for line in entries:
            fields = line.split(" ")
            a = fields[0]
            if a not in stig:
                continue
            # Rebuild the title from scratch for every entry so that text
            # from previously processed titles never leaks into this one.
            # Slicing (rather than popping twice) also tolerates short lines.
            titlestr = ''.join(word + ' ' for word in fields[2:])
            if trained < 50:
                # Train the classifier using the first 50 rated entries.
                cl.train(titlestr, stig[a])
                trained += 1
            else:
                print("\n")
                print(cl.fprob(titlestr, 'A'))
                print("\n")
Beispiel #9
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import docclass

# Train on the sample set twice, printing the weighted probability of
# 'money' for category 'good' after each pass.
cl = docclass.classifier(docclass.getwords)
for _ in range(2):
    docclass.sampletrain(cl)
    print(cl.weightedprob('money', 'good', cl.fprob))
Beispiel #10
0
import docclass
# Re-execute the docclass module before using it.
reload(docclass)

# Build a classifier, hand it to docclass.sampledata and print the weighted
# probability of "money" for category "good".
# NOTE(review): presumably sampledata seeds the classifier with sample
# documents -- confirm against the docclass module in use.
cl = docclass.classifier(docclass.getwords)
docclass.sampledata(cl)
money_given_good = cl.weightedprob("money", "good", cl.fprob)
print(money_given_good)


Beispiel #11
0
 def testBasic(self):
   """Check that fprob('spam', 'bad') is 1.0 after training one 'bad' document."""
   cl = docclass.classifier(docclass.getwords)
   cl.setdb('test.db')
   cl.train('spam spam spam', 'bad')
   # assertEqual replaces the deprecated assertEquals alias.
   self.assertEqual(1.0, cl.fprob('spam', 'bad'))
Beispiel #12
0
    def data_Fetch_Train(self):
        """Build the fetch/train and analysis panels, then crawl and train.

        Lays out two frames on ``self.parent`` (a book list box with three
        estimate labels, and a second list box next to a text area), creates
        a ``docclass.classifier``, passes the first two entries returned by
        ``self.GetgenresFromFile()`` to ``self.Crawler``, trains the
        classifier on every item of the result and fills both list boxes.

        Relies on ``self.label5``, ``self.listClicked`` and ``self.analysis``
        being defined elsewhere on the class.
        """
        # Resize and position the parent window.
        self.parent.geometry("780x595+200+50")

        # GUI For Fetching and Training

        self.frame3 = Frame(self.parent, bg='wheat2')
        # NOTE(review): both `sticky` and `stick` are passed here (and on the
        # labels below); presumably Tk treats `stick` as an abbreviation of
        # `sticky` -- confirm before removing either.
        self.frame3.grid(row=2, column=0, sticky=W, stick=W)

        self.label6 = Label(self.frame3,
                            text="Individual Books",
                            fg='black',
                            font=("Helvetica", 12),
                            bg="wheat2")
        self.label6.grid(row=0, column=0, padx=5, stick=S, sticky=S)

        self.label7 = Label(self.frame3,
                            text="Top 3 Estimates ",
                            fg='black',
                            font=("Helvetica", 12),
                            bg="wheat2")
        self.label7.grid(row=0, column=1, padx=5, stick=S, sticky=S)

        # List box of individual entries; selecting one calls self.listClicked.
        self.listBox1 = Listbox(self.frame3,
                                width=50,
                                height=10,
                                font=("Helvetica", 8))
        self.listBox1.grid(row=1, rowspan=5, column=0, pady=5, padx=140)
        self.listBox1.bind('<<ListboxSelect>>', self.listClicked)

        # Three placeholder labels in the "Top 3 Estimates" column.
        self.labelp1 = Label(self.frame3, text="one ", font=("Helvetica", 10))
        self.labelp1.grid(row=2, column=1, padx=5)

        self.labelp2 = Label(self.frame3, text=" Two", font=("Helvetica", 10))
        self.labelp2.grid(row=3, column=1, padx=5)

        self.labelp3 = Label(self.frame3,
                             text=" Three",
                             font=("Helvetica", 10))
        self.labelp3.grid(row=4, column=1, padx=5)

        ################# Analysis part GUI

        self.frame4 = Frame(self.parent, bg='wheat2')
        self.frame4.grid(row=3, column=0)

        self.l8 = Label(self.frame4,
                        text='Accuracy analysis based \n on Genres: ',
                        font=("Helvetica", 10),
                        bg='wheat2')
        self.l8.grid(row=0, column=0)

        # List box filled with the keys of self.BooksDATA further down;
        # selecting one calls self.analysis.
        self.listbox2 = Listbox(self.frame4, height=8)
        self.listbox2.grid(row=1, column=0)
        self.listbox2.bind('<<ListboxSelect>>', self.analysis)

        self.text = Text(self.frame4, height=8, width=30)
        self.text.grid(row=1, column=1)

        # Fetching part

        # Note the attribute is spelled `clasifier` (single "s").
        self.clasifier = docclass.classifier(docclass.getwords)
        data = self.GetgenresFromFile()
        # Only the first two entries are passed on to the crawler.
        data = [data[0], data[1]]
        self.BooksDATA = self.Crawler(data)
        print self.BooksDATA

        # Training Part

        self.listboxinfo = []
        for key, value in self.BooksDATA.items():
            # print key
            self.listbox2.insert(END, key)
            for val in value:
                print key, " --> ", val[1]
                # val[1] is the training text and key its category; the list
                # entry is built as "<val[0]>- <val[1]>-<key>".
                # NOTE(review): presumably val is an (identifier, title) pair
                # and key a genre -- confirm against Crawler.
                self.listboxinfo.append(val[0] + '- ' + val[1] + '-' + key)
                self.clasifier.train(val[1], key)

        # Shuffle the entries before inserting them into the first list box.
        # NOTE(review): per the Python docs the second argument of
        # random.shuffle was deprecated in 3.9 and removed in 3.11 -- confirm
        # the target interpreter before porting.
        random.shuffle(self.listboxinfo, random.random)
        for i in self.listboxinfo:
            self.listBox1.insert(END, i)
        # self.label5 is created elsewhere; mark it as trained.
        self.label5.config(text="< Trained >", fg='black', bg='Gold')
Beispiel #13
0
 def test_incf(self):
     """Train on one 'good' sentence and compare ``fc`` with getwords() of it."""
     sentence = 'the quick brown for jumps over the lazy dog'
     trained = docclass.classifier(docclass.getwords)
     trained.train(sentence, 'good')
     # The classifier's fc attribute is compared with the tokenizer output
     # for the same sentence.
     expected = docclass.getwords(sentence)
     self.assertDictEqual(trained.fc, expected)
#!/usr/bin/env python

import docclass

# Train on one 'good' and one 'bad' sentence, then print the count recorded
# for 'quick' under each tag.
classifier = docclass.classifier()

classifier.train('the quick brown fox jumps over the lazy dog', 'good')
classifier.train('make quick money in the online casino', 'bad')

# Parenthesised single-argument print parses on Python 3 and prints the same
# text on Python 2.
print(classifier.get_word_count_per_tag('quick', 'good'))
print(classifier.get_word_count_per_tag('quick', 'bad'))

Beispiel #15
0
 def test_incf(self):
     """Train on one 'good' sentence and compare ``fc`` with getwords() of it."""
     sentence = 'the quick brown for jumps over the lazy dog'
     trained = docclass.classifier(docclass.getwords)
     trained.train(sentence, 'good')
     # The classifier's fc attribute is compared with the tokenizer output
     # for the same sentence.
     expected = docclass.getwords(sentence)
     self.assertDictEqual(trained.fc, expected)