def upload_csv_button(self):
    # Ask the user for a CSV file and read its contents
    self.file_path = tkFileDialog.askopenfilename(
        initialdir='/', title='Select file',
        filetypes=(('csv files', '*.csv'), ('all files', '*.*')))
    with open(self.file_path) as f:
        s = f.read()
    my_d = repr(s)
    # Tokenise the file contents, then build a classifier and seed it
    docclass.getwords(my_d)
    c1 = docclass.classifier(docclass.getwords)
    docclass.sampletrain(c1)
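# docclass.sampletrain is not defined in these snippets. In the widely used
# Programming Collective Intelligence version of docclass it simply seeds a
# classifier with a handful of good/bad phrases; the sketch below is an
# assumption about the module being used here, not a verified part of it.
def sampletrain(cl):
    cl.train('Nobody owns the water.', 'good')
    cl.train('the quick rabbit jumps fences', 'good')
    cl.train('buy pharmaceuticals now', 'bad')
    cl.train('make quick money at the online casino', 'bad')
    cl.train('the quick brown fox jumps', 'good')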
def get_article_words_count(feedlist):
    allwords = {}
    articlewords = []
    articletitles = set()
    ec = 0
    # Loop over every feed
    for feed in feedlist:
        f = feedparser.parse(feed)
        for e in f.entries:
            # Ignore identical articles
            if e.title in articletitles:
                continue

            # Extract the words
            txt = e.title.encode('utf-8') + stripHTML(e.description.encode('utf-8'))
            words = getwords(txt)
            articlewords.append({})
            articletitles.add(e.title)

            # Increase the counts of every word in allwords and articlewords
            for word in words:
                allwords.setdefault(word, 0)
                allwords[word] += 1
                articlewords[ec].setdefault(word, 0)
                articlewords[ec][word] += 1
            ec += 1
    return allwords, articlewords, list(articletitles)
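# Hypothetical usage of get_article_words_count; the feed URLs below are
# placeholders, and stripHTML/getwords are assumed to be in scope.
feedlist = ['http://feeds.example.com/world.xml',
            'http://feeds.example.com/technology.xml']
allwords, articlewords, articletitles = get_article_words_count(feedlist)
# allwords        : total count of each word across every article
# articlewords[i] : per-word counts for article i
# articletitles   : distinct entry titles (unordered, since they come from a set)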
def parse_netflix_data(feed_file, con, predicted):
    # Get all features from the database and save them into an array
    cur = con.execute('select * from fc')
    features = [f[0] for f in cur]
    titles = []
    data = []

    feeds = feedparser.parse(feed_file)
    entries = feeds['entries']
    for e in range(len(entries)):
        entry = entries[e]
        title = ''
        publisher = ''
        summary = ''
        if 'title' in entry:
            title = entry['title']
        if 'publisher' in entry:
            publisher = entry['publisher']
        if 'summary' in entry:
            summary = entry['summary']
        fulltext = '{} {} {}'.format(title, publisher, summary)
        words = docclass.getwords(fulltext)

        # Calculate the word counts
        wc = {}
        for w in words:
            wc.setdefault(w, 0)
            wc[w] += 1

        # Convert wc to colon-separated index:value pairs,
        # using the term's index in the feature list
        arr_wc = []
        for term, value in wc.items():
            if term in features:
                arr_wc.append('{}:{}'.format(features.index(term), value))

        # Get the actual category, compare it with the predicted one,
        # and insert the result (+1 or -1) as the first column
        (actual,) = con.execute(
            'select actual_category from entry where title =?',
            (title,)).fetchone()
        if actual and actual == predicted:
            arr_wc.insert(0, '+1')
        else:
            arr_wc.insert(0, '-1')

        # Join the array into a space-separated value
        str_wc = ' '.join(arr_wc)
        titles.append(title)
        data.append(str_wc)
    return (titles, features, data)
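# The rows built above ("+1 12:3 57:1 ...") follow the sparse label/index:value
# layout used by LIBSVM-style tools. A small hypothetical helper for writing
# them out, with the titles kept in a parallel file for inspection:
def write_svm_files(titles, data, prefix='netflix'):
    with open(prefix + '.svm', 'w') as out:
        out.write('\n'.join(data) + '\n')
    with open(prefix + '.titles', 'w') as out:
        out.write('\n'.join(titles) + '\n')
# Note: strict LIBSVM input additionally expects ascending (and usually
# 1-based) feature indices, which parse_netflix_data does not guarantee.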
def analysis(self, event):
    self.text.delete(1.0, END)
    index = self.listbox2.curselection()
    choice = self.listbox2.get(index)
    choice2 = "Genre: " + choice + '\n'
    GenresBooks = self.BooksDATA[choice]
    totalB = "Total Number Of Books: " + str(len(GenresBooks))
    self.text.insert(END, choice2)
    self.text.insert(END, totalB)

    # Predict a genre for every book in the selected genre
    predictedGenre = []
    Books = self.BooksDATA[choice]
    for i in Books:
        self.selectedBook = i[1]
        estimates = []
        for cat in self.clasifier.categories():
            prob = 0
            for word in docclass.getwords(self.selectedBook):
                prob += self.clasifier.fprob(word, cat, default_prob=0)
            if prob != 0:
                estimates.append([cat, prob])
        # Keep only the highest-scoring category
        estimates = sorted(estimates, key=lambda x: x[1], reverse=True)
        estimates = estimates[0]
        predictedGenre.append(estimates)
    print predictedGenre

    # Count how many predictions match the selected genre
    c = len(self.BooksDATA[choice])
    wrong = 0
    for j in predictedGenre:
        if j[0] != choice:
            c -= 1
            wrong += 1
    correct = '\nCorrectly predicted Books: ' + str(c) + '\n'
    wrongg = 'Wrongly Predicted Books: ' + str(wrong) + '\n'
    accuracy = 'Accuracy: ' + str(
        (float(c) / len(self.BooksDATA[choice])) * 100)
    self.text.insert(END, correct)
    self.text.insert(END, wrongg)
    self.text.insert(END, accuracy)
def testStripDuplicates(self):
    # getwords should report each word only once, however often it appears
    self.assertEquals(['mail', 'spam'],
                      sorted(list(docclass.getwords('spam mail spam'))))
def listClicked(self, event):
    # Reset the three prediction labels
    for label in (self.labelp1, self.labelp2, self.labelp3):
        label.config(text=' ', bg='wheat2')

    # The list entry is split on '-': field 1 holds the book text
    # (with a leading space stripped), field 2 the actual category
    index = self.listBox1.curselection()
    temp = self.listBox1.get(index[0])
    catt = temp.split('-')[2]
    temp = temp.split('-')[1]
    temp = temp[1:]
    self.selectedBook = temp

    # Score every category by summing the word probabilities
    estimates = []
    for cat in self.clasifier.categories():
        prob = 0
        for word in docclass.getwords(self.selectedBook):
            prob += self.clasifier.fprob(word, cat, default_prob=0)
        if prob != 0:
            estimates.append([cat, prob])
    estimates = sorted(estimates, key=lambda x: x[1], reverse=True)

    if len(estimates) == 0:
        print 'wait what'
        return

    # Show up to the three best categories: green when that category
    # matches the actual one, red when it does not
    labels = [self.labelp1, self.labelp2, self.labelp3]
    for i, (label, est) in enumerate(zip(labels, estimates)):
        line = str(est[0]) + " -->" + str(est[1])
        if est[0] == catt:
            label.config(text=line, bg='green')
        else:
            if i == 0:
                print estimates
            label.config(text=line, bg='red')
def test_incf(self):
    clas = docclass.classifier(docclass.getwords)
    origin_str = 'the quick brown fox jumps over the lazy dog'
    clas.train(origin_str, 'good')
    # After a single training call the feature counts should match the
    # tokenised document (this assumes fc is a flat {word: count} mapping)
    str_tobe = docclass.getwords(origin_str)
    self.assertDictEqual(clas.fc, str_tobe)
def test_getwords(self):
    words_source = 'disk*jdhs&342nhek[]989c12cnjshdsjkv774*j'
    # Digits and punctuation act as separators; very short tokens are dropped
    words_result_dict = dict([('disk', 1), ('jdhs', 1),
                              ('nhek', 1), ('cnjshdsjkv', 1)])
    self.assertDictEqual(words_result_dict, docclass.getwords(words_source))
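# The expected dictionary above implies a tokenizer that treats anything other
# than a letter as a separator and keeps only tokens longer than two
# characters, each mapped to 1. A minimal sketch of such a getwords, written
# here as an assumption since docclass itself is not shown in these snippets:
import re

def getwords(doc):
    # Split on runs of non-letters and drop very short or very long tokens
    words = [s.lower() for s in re.split('[^a-zA-Z]+', doc)
             if 2 < len(s) < 20]
    # Each distinct word appears exactly once
    return dict([(w, 1) for w in words])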
import docclass

docclass.getwords('python is a dynamic language')

# Build a naive Bayes classifier backed by a SQLite database
cl = docclass.naivebayes(docclass.getwords)
cl.setdb('test.db')

# Train on a few examples from two categories
cl.train('pythons are constrictors', 'snake')
cl.train('python has dynamic types', 'language')
cl.train('python was developed as scripting language', 'language')

# Classify unseen phrases
cl.classify('dynamic programming')
cl.classify('boa constrictors')
exit()
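# Usage note (an assumption about this docclass variant): setdb('test.db')
# typically backs the feature and category counts with a SQLite file, so
# repeating the session above keeps adding to the same counts. To start each
# run from an untrained classifier, remove the database file beforehand:
import os

if os.path.exists('test.db'):
    os.remove('test.db')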