def main():
    """Run the docclass demo: basic, naive Bayes and Fisher classifiers.

    Every classifier is bound to the 'test1.db' database and every result is
    printed. The last section hands a Fisher classifier to ``feedclassifier``
    (defined outside this block) together with 'feed_sample2.rss'.
    """
    # --- Basic classifier -------------------------------------------------
    basic = docclass.classifier(docclass.getwords)
    basic.setdb('test1.db')
    docclass.sampletrain(basic)
    print(basic.fprob('quick', 'good'))
    print(basic.weighted_prob('money', 'good', basic.fprob))
    # Same query again after a second round of sample training.
    docclass.sampletrain(basic)
    print(basic.weighted_prob('money', 'good', basic.fprob))

    # --- Naive Bayes ------------------------------------------------------
    bayes = docclass.naivebayes(docclass.getwords)
    bayes.setdb('test1.db')
    docclass.sampletrain(bayes)
    for category in ('good', 'bad'):
        print(bayes.prob('quick rabbit', category))
    for text in ('quick rabbit', 'quick money'):
        print(bayes.classify(text, default='unknown'))
    # Re-classify 'quick money' once the 'bad' threshold has been set to 3.0.
    bayes.setthreshold('bad', 3.0)
    print(bayes.classify('quick money', default='unknown'))

    # --- Fisher classifier ------------------------------------------------
    fisher = docclass.fisherclassifier(docclass.getwords)
    fisher.setdb('test1.db')
    docclass.sampletrain(fisher)
    print(fisher.cprob('quick', 'good'))
    print(fisher.cprob('money', 'bad'))
    print(fisher.weighted_prob('money', 'bad', fisher.cprob))
    for category in ('good', 'bad'):
        print(fisher.fisherprob('quick rabbit', category))
    for text in ('quick rabbit', 'quick money'):
        print(fisher.classify(text))

    # --- Fisher classifier driven by a feed -------------------------------
    feed_fisher = docclass.fisherclassifier(docclass.getwords)
    feed_fisher.setdb('test1.db')
    feedclassifier('feed_sample2.rss', feed_fisher)
    for word in ('Pandas', 'python'):
        print(feed_fisher.cprob(word, 'python'))
def train1():
    """Apply the docclass sample training 2000 times to 'test1.db'.

    The classifier's connection (``con``) is committed once, after the last
    training round.
    """
    import docclass

    trainer = docclass.classifier(docclass.getwords)
    trainer.setdb('test1.db')

    rounds = 2000
    for _ in range(rounds):
        docclass.sampletrain(trainer)

    trainer.con.commit()
def test_fprob():
    """Sample-train a fresh classifier and write fprob('quick', 'good') to stdout.

    A progress message goes to stderr first, and ``docclass`` is reloaded
    before the classifier is created.
    """
    sys.stderr.write("testing sampletrain and computation of feature probability...\n")
    reload(docclass)

    classifier = docclass.classifier(docclass.getwords)
    docclass.sampletrain(classifier)

    # Expected output according to the original author: 0.6666...
    quick_in_good = classifier.fprob('quick', 'good')
    sys.stdout.write("%f\n" % quick_in_good)
def test_train():
    """Train on one 'good' and one 'bad' sentence, then write the 'quick' counts.

    A progress message goes to stderr; ``fcount('quick', ...)`` is written to
    stdout for 'good' and then for 'bad'.
    """
    sys.stderr.write("testing training...\n")
    classifier = docclass.classifier(docclass.getwords)

    samples = (
        ('the quick brown fox jumps over the lazy dog', 'good'),
        ('make quick money in the online casino', 'bad'),
    )
    for text, category in samples:
        classifier.train(text, category)

    # Expected output according to the original author: 1.0 for each category.
    for category in ('good', 'bad'):
        count = classifier.fcount('quick', category)
        sys.stdout.write("%f\n" % count)
def test_weightedprob():
    """Write weightedprob('money', 'good') after one and after two sample trainings.

    A progress message goes to stderr, and ``docclass`` is reloaded before the
    classifier is created.
    """
    sys.stderr.write("testing computation of weightedprob (probability for unseen words)...\n")
    reload(docclass)
    classifier = docclass.classifier(docclass.getwords)

    # Expected outputs according to the original author: 0.25, then 0.1666...
    for _ in range(2):
        docclass.sampletrain(classifier)
        weighted = classifier.weightedprob('money', 'good', classifier.fprob)
        sys.stdout.write("%f\n" % weighted)
def upload_csv_button(self):
    """Let the user pick a CSV file, read it and run the docclass sample steps.

    The selected path is stored in ``self.file_path``. If the dialog is
    dismissed without a selection, the method returns right after storing the
    (empty) path instead of failing on ``open``.
    """
    self.file_path = tkFileDialog.askopenfilename(
        initialdir='/',
        title='Select file',
        filetypes=(('csv files', '*.csv'), ('all files', '*.*')))

    # A cancelled dialog yields an empty value; there is nothing to read.
    if not self.file_path:
        return

    with open(self.file_path) as f:
        contents = f.read()

    # NOTE(review): the word list and the trained classifier below are not
    # kept or returned; the calls are preserved in case callers rely on any
    # side effects inside docclass -- confirm whether they are still needed.
    docclass.getwords(repr(contents))
    c1 = docclass.classifier(docclass.getwords)
    docclass.sampletrain(c1)
def FisherClassifier(fn):
    """Train a docclass classifier on rated entries and print scores for the rest.

    'rated.txt' is read as space-separated lines: the first field is an entry
    key and the second field is its category. ``fn`` is read as
    space-separated lines whose first field is the entry key; the first two
    fields are dropped and the remaining fields form the title.

    The first 50 entries of ``fn`` whose key occurs in 'rated.txt' are used to
    train the classifier (bound to 'A10.db'). For each later rated entry,
    ``cl.fprob(title, 'A')`` is printed between blank lines.

    Args:
        fn: path of the file holding one entry per line.
    """
    cl = docclass.classifier(docclass.getwords)
    cl.setdb('A10.db')

    # Entry key -> manually assigned category.
    stig = {}
    with open('rated.txt') as rated:
        for entry in rated:
            fields = str(entry).split(" ")
            stig[fields[0]] = fields[1].encode('utf-8').strip('\n')

    with open(fn) as entries:
        doc = entries.readlines()

    trained = 0
    for line in doc:
        fields = line.split(" ")
        key = fields[0]
        if key not in stig:
            continue

        # Build the title afresh for every entry. Previously the scoring
        # branch kept appending to the string left over from earlier entries,
        # so each printed score was computed on a concatenation of titles.
        # Slicing (rather than popping twice) also tolerates short/blank lines.
        titlestr = ''.join(word + ' ' for word in fields[2:])

        if trained < 50:
            # Train on the first 50 manually rated entries.
            cl.train(titlestr, stig[key])
            trained += 1
        else:
            print("\n")
            print(cl.fprob(titlestr, 'A'))
            print("\n")
def FisherClassifier(fn):
    """Train a docclass classifier on rated entries and print scores for the rest.

    'rated.txt' is read as space-separated lines: the first field is an entry
    key and the second field is its category. ``fn`` is read as
    space-separated lines whose first field is the entry key; the first two
    fields are dropped and the remaining fields form the title.

    The first 50 entries of ``fn`` whose key occurs in 'rated.txt' are used to
    train the classifier (bound to 'A10.db'). For each later rated entry,
    ``cl.fprob(title, 'A')`` is printed between blank lines.

    Args:
        fn: path of the file holding one entry per line.
    """
    cl = docclass.classifier(docclass.getwords)
    cl.setdb('A10.db')

    # Entry key -> manually assigned category.
    stig = {}
    with open('rated.txt') as rated:
        for entry in rated:
            fields = str(entry).split(" ")
            stig[fields[0]] = fields[1].encode('utf-8').strip('\n')

    with open(fn) as entries:
        doc = entries.readlines()

    trained = 0
    for line in doc:
        fields = line.split(" ")
        key = fields[0]
        if key not in stig:
            continue

        # Build the title afresh for every entry. Previously the scoring
        # branch kept appending to the string left over from earlier entries,
        # so each printed score was computed on a concatenation of titles.
        # Slicing (rather than popping twice) also tolerates short/blank lines.
        titlestr = ''.join(word + ' ' for word in fields[2:])

        if trained < 50:
            # Train on the first 50 manually rated entries.
            cl.train(titlestr, stig[key])
            trained += 1
        else:
            print("\n")
            print(cl.fprob(titlestr, 'A'))
            print("\n")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import docclass

cl = docclass.classifier(docclass.getwords)

# Two rounds of sample training; after each round print the weighted
# probability reported for 'money' in category 'good'.
for _ in range(2):
    docclass.sampletrain(cl)
    print(cl.weightedprob('money', 'good', cl.fprob))
import docclass

# Pick up the latest edits to docclass before building the classifier.
reload(docclass)

cl = docclass.classifier(docclass.getwords)
docclass.sampledata(cl)

# Weighted probability reported for 'money' in category 'good'.
money_given_good = cl.weightedprob('money', 'good', cl.fprob)
print(money_given_good)
def testBasic(self):
    """After training 'spam spam spam' as 'bad', fprob('spam', 'bad') must be 1.0."""
    cl = docclass.classifier(docclass.getwords)
    cl.setdb('test.db')
    cl.train('spam spam spam', 'bad')
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest versions no longer provide.
    self.assertEqual(1.0, cl.fprob('spam', 'bad'))
def data_Fetch_Train(self): self.parent.geometry("780x595+200+50") # GUI For Fetching and Training self.frame3 = Frame(self.parent, bg='wheat2') self.frame3.grid(row=2, column=0, sticky=W, stick=W) self.label6 = Label(self.frame3, text="Individual Books", fg='black', font=("Helvetica", 12), bg="wheat2") self.label6.grid(row=0, column=0, padx=5, stick=S, sticky=S) self.label7 = Label(self.frame3, text="Top 3 Estimates ", fg='black', font=("Helvetica", 12), bg="wheat2") self.label7.grid(row=0, column=1, padx=5, stick=S, sticky=S) self.listBox1 = Listbox(self.frame3, width=50, height=10, font=("Helvetica", 8)) self.listBox1.grid(row=1, rowspan=5, column=0, pady=5, padx=140) self.listBox1.bind('<<ListboxSelect>>', self.listClicked) self.labelp1 = Label(self.frame3, text="one ", font=("Helvetica", 10)) self.labelp1.grid(row=2, column=1, padx=5) self.labelp2 = Label(self.frame3, text=" Two", font=("Helvetica", 10)) self.labelp2.grid(row=3, column=1, padx=5) self.labelp3 = Label(self.frame3, text=" Three", font=("Helvetica", 10)) self.labelp3.grid(row=4, column=1, padx=5) ################# Analysis part GUI self.frame4 = Frame(self.parent, bg='wheat2') self.frame4.grid(row=3, column=0) self.l8 = Label(self.frame4, text='Accuracy analysis based \n on Genres: ', font=("Helvetica", 10), bg='wheat2') self.l8.grid(row=0, column=0) self.listbox2 = Listbox(self.frame4, height=8) self.listbox2.grid(row=1, column=0) self.listbox2.bind('<<ListboxSelect>>', self.analysis) self.text = Text(self.frame4, height=8, width=30) self.text.grid(row=1, column=1) # Fetching part self.clasifier = docclass.classifier(docclass.getwords) data = self.GetgenresFromFile() data = [data[0], data[1]] self.BooksDATA = self.Crawler(data) print self.BooksDATA # Training Part self.listboxinfo = [] for key, value in self.BooksDATA.items(): # print key self.listbox2.insert(END, key) for val in value: print key, " --> ", val[1] self.listboxinfo.append(val[0] + '- ' + val[1] + '-' + key) 
self.clasifier.train(val[1], key) random.shuffle(self.listboxinfo, random.random) for i in self.listboxinfo: self.listBox1.insert(END, i) self.label5.config(text="< Trained >", fg='black', bg='Gold')
def test_incf(self):
    """After training on one sentence, ``fc`` must equal ``getwords`` of that sentence."""
    sentence = 'the quick brown for jumps over the lazy dog'

    trained = docclass.classifier(docclass.getwords)
    trained.train(sentence, 'good')

    expected = docclass.getwords(sentence)
    self.assertDictEqual(trained.fc, expected)
#!/usr/bin/env python
import docclass

classifier = docclass.classifier()

# Train on one 'good' and one 'bad' sample sentence.
classifier.train('the quick brown fox jumps over the lazy dog', 'good')
classifier.train('make quick money in the online casino', 'bad')

# Print get_word_count_per_tag for 'quick' under each tag, 'good' first.
for tag in ('good', 'bad'):
    print(classifier.get_word_count_per_tag('quick', tag))