コード例 #1
0
def main():
    """Run the basic, naive Bayes and Fisher classifiers against test1.db.

    Every result is printed; nothing is returned. Single-argument
    ``print(...)`` calls are used, which emit the same text as the
    equivalent Python 2 print statements.
    """
    # Basic classifier: feature probabilities before and after retraining.
    basic = docclass.classifier(docclass.getwords)
    basic.setdb('test1.db')
    docclass.sampletrain(basic)
    print(basic.fprob('quick', 'good'))
    print(basic.weighted_prob('money', 'good', basic.fprob))
    docclass.sampletrain(basic)
    print(basic.weighted_prob('money', 'good', basic.fprob))

    # Naive Bayes: document probabilities, then classification with and
    # without a threshold on the 'bad' category.
    bayes = docclass.naivebayes(docclass.getwords)
    bayes.setdb('test1.db')
    docclass.sampletrain(bayes)
    for category in ('good', 'bad'):
        print(bayes.prob('quick rabbit', category))
    for document in ('quick rabbit', 'quick money'):
        print(bayes.classify(document, default='unknown'))
    bayes.setthreshold('bad', 3.0)
    print(bayes.classify('quick money', default='unknown'))

    # Fisher classifier on the same sample data.
    fisher = docclass.fisherclassifier(docclass.getwords)
    fisher.setdb('test1.db')
    docclass.sampletrain(fisher)
    print(fisher.cprob('quick', 'good'))
    print(fisher.cprob('money', 'bad'))
    print(fisher.weighted_prob('money', 'bad', fisher.cprob))
    for category in ('good', 'bad'):
        print(fisher.fisherprob('quick rabbit', category))
    for document in ('quick rabbit', 'quick money'):
        print(fisher.classify(document))

    # Second Fisher classifier, trained from an RSS feed sample.
    feed_fisher = docclass.fisherclassifier(docclass.getwords)
    feed_fisher.setdb('test1.db')
    feedclassifier('feed_sample2.rss', feed_fisher)
    for word in ('Pandas', 'python'):
        print(feed_fisher.cprob(word, 'python'))
コード例 #2
0
 def guess_the_prof(self):
     """Guess the department of the selected professor and show the verdict.

     Builds a naive Bayes classifier when ``self.method == 'naive'`` and a
     Fisher classifier otherwise, trains it through ``self.trainer``, applies
     the per-category limits listed in ``self.lb`` and stores the
     classification result in ``self.pdep`` before calling ``self.verdict()``.

     Listbox entries are parsed as ``"<number>-<category>"``. Only the first
     hyphen separates the two parts, so category names that contain a hyphen
     are kept whole.

     Raises:
         ValueError: if the part before the first hyphen is not a number.
         IndexError: if an entry contains no hyphen at all.
     """
     # The two methods differ only in the classifier class and in the setter
     # used for the per-category limit.
     if self.method == 'naive':
         cl = docclass.naivebayes(docclass.getwords)
         apply_limit = cl.setthreshold
     else:
         cl = docclass.fisherclassifier(docclass.getwords)
         apply_limit = cl.setminimum

     prof_sel = self.box.get()  # the professor whose department we want to guess
     doc_of_prof = self.profs_data[prof_sel]
     self.trainer(prof_sel, cl)

     # Parse every entry before applying any limit, so a malformed entry
     # raises before the classifier has been partially configured.
     thresholds = []
     for item in self.lb.get(0, END):
         merged = item.split('-', 1)
         threshnum = float(merged[0])
         thresholds.append((merged[1], threshnum))
     for category, limit in thresholds:
         apply_limit(category, limit)

     self.pdep = cl.classify(doc_of_prof, default='unknown')
     self.verdict()
コード例 #3
0
def train2():
    """Train a naive Bayes classifier stored in test2.db with the sample data.

    The sample training set is applied 2000 times and the classifier's
    database connection is committed once at the end.
    """
    import docclass

    bayes = docclass.naivebayes(docclass.getwords)
    bayes.setdb('test2.db')
    remaining = 2000
    while remaining > 0:
        docclass.sampletrain(bayes)
        remaining -= 1
    bayes.con.commit()
コード例 #4
0
    def fetch(self):  # fetching phase
        """Load course data from the entered URL, train, and fill the listboxes.

        Steps, in order:
          1. turn the status indicator yellow and reset ``self.course_list``;
          2. build a ``Data`` object from the URL in ``self.entry``;
          3. train a naive Bayes classifier with each course's lower-cased
             ``split_name`` as the document and ``first_code`` as the category;
          4. list every course in ``self.coursesListbox`` and every distinct
             ``first_code`` in ``self.programsListbox``;
          5. turn the indicator green and bind the listbox selection handlers.
        """
        self.color.config(background='yellow')
        self.course_list = []
        self.update()
        url = self.entry.get()
        self.dataobj = Data()  # creating data obj
        self.dataobj.init_data(url)
        courselist = self.dataobj.courselist
        # sorted() returns a list on both Python 2 and 3; calling .sort() on
        # the result of .keys() only works on Python 2.
        self.courses = sorted(courselist.keys())
        self.obj_list = [courselist[key] for key in self.courses]

        self.classifier_obj = docclass.naivebayes(docclass.getwords)
        for course in self.obj_list:  # training phase
            self.classifier_obj.train(course.split_name.lower(),
                                      course.first_code)

        # Text before the first "(" in the course name, trailing spaces dropped.
        r1 = re.compile(r"(.*?)\s*\(")
        for key in self.courses:  # adding courses to listbox
            course_name = courselist[key].name
            name = r1.match(course_name)
            if name is not None:
                # NOTE(review): no space is inserted between the key and "("
                # here, unlike the fallback branch below. Kept as in the
                # original; confirm whether that is intended.
                name1 = key + '(' + name.group(1) + ')'
            else:
                name1 = key + ' ' + '(' + course_name + ')'
            self.coursesListbox.insert(END, name1)

        for key in self.courses:  # adding course category to other listbox
            code = courselist[key].first_code
            if code not in self.course_list:
                self.course_list.append(code)
            if code not in self.programsListbox.get(0, END):
                self.programsListbox.insert(END, code)

        self.color.config(background='green')
        self.update()
        self.coursesListbox.bind('<<ListboxSelect>>', self.estimate)
        self.programsListbox.bind('<<ListboxSelect>>', self.analyze)
コード例 #5
0
 def train_classifier(self):  # Trainer
     """Return a naive Bayes classifier trained on all faculty publications.

     Every publication of every entry in ``self.faculty_members`` is used as
     a document, with the entry's key (the professor) as its category.
     """
     trained = docclass.naivebayes(docclass.getwords)
     for prof, record in self.faculty_members.items():
         papers = record.publications
         for paper in papers:
             trained.train(paper, prof)
     return trained
コード例 #6
0
def test_nb_prob():
    """Print the naive Bayes probability of 'quick rabbit' for both categories.

    Reloads ``docclass``, trains a fresh classifier with the sample data and
    writes one probability per line to stdout: first for 'good', then 'bad'.
    """
    sys.stderr.write("testing computation of naive bayes probability...\n")
    reload(docclass)
    bayes = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(bayes)
    # Values noted in the original comments: 0.15624999.. for 'good' and
    # 0.05000000... for 'bad'.
    for category in ('good', 'bad'):
        probability = bayes.prob('quick rabbit', category)
        sys.stdout.write("%f\n" % (probability))
コード例 #7
0
  def testClassify(self):
    """Check naive Bayes classification with and without a 'bad' threshold.

    Uses ``assertEqual``; the ``assertEquals`` spelling is a deprecated alias
    that newer Python versions no longer provide.
    """
    cl = docclass.naivebayes(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    self.assertEqual('good', cl.classify('quick rabbit', default='unknown'))
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))

    # With a threshold of 3.0 on 'bad' the same document falls back to the
    # default category.
    cl.setthreshold('bad', 3.0)
    self.assertEqual('unknown', cl.classify('quick money', default='unknown'))

    # After ten more rounds of sample training 'bad' is returned again.
    for _ in range(10):
      docclass.sampletrain(cl)
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
コード例 #8
0
ファイル: mp4_real.py プロジェクト: aycaikaya/Python
    def run_spamche_button(self):
        """Classify the text in ``self.lbox`` and show 'spam' or 'ham'.

        A classifier is trained on the sample data with a threshold of 3.0
        for the 'bad' category. An empty input shows a reminder message
        instead of a verdict.
        """
        spam_filter = docclass.naivebayes(docclass.getwords)
        docclass.sampletrain(spam_filter)
        spam_filter.setthreshold('bad', 3.0)

        text = self.lbox.get()
        if text == '':
            self.var.set('Please load the spam training test!')
            return

        # The default label is kept exactly as in the original; only the
        # 'bad' result is distinguished below.
        verdict = spam_filter.classify(text, default='unkown')
        self.var.set('spam' if verdict == 'bad' else 'ham')
コード例 #9
0
def test_nb_classify():
    """Print naive Bayes classifications before and after setting a threshold.

    Reloads ``docclass``, trains a fresh classifier on the sample data and
    writes each classification result on its own line to stdout. The
    trailing comments give the results noted in the original code.
    """
    sys.stderr.write("testing naive bayes classification...\n")
    reload(docclass)
    bayes = docclass.naivebayes(docclass.getwords)
    docclass.sampletrain(bayes)

    def show(document):
        # Classify first, then emit the category followed by a newline.
        verdict = bayes.classify(document, default='unknown')
        sys.stdout.write("%s\n" % (verdict))

    show('quick rabbit')  # 'good'
    show('quick money')  # 'bad'
    # test threshold
    bayes.setthreshold('bad', 3.0)
    show('quick money')  # 'unknown'
    for _ in range(10):
        docclass.sampletrain(bayes)
    show('quick money')  # 'bad'
コード例 #10
0
        def predict(event):
            """Train a classifier on filled-in rows, classify the rest, print.

            Rows of ``self.trainer`` whose first two fields are both
            non-empty are used for training (field 0 as the document, field 1
            as the category). For each row of ``self.guess`` with a non-empty
            field 0, the classification result is stored in field 1.
            ``self.printOut()`` is called at the end.
            """
            bayes = docclass.naivebayes(docclass.getwords)

            # training classifier
            for key in self.trainer:
                sample = self.trainer[key]
                document, category = sample[0], sample[1]
                if document != "" and category != "":
                    bayes.train(document, category)

            # predicting grades for other courses
            for key in self.guess:
                pending = self.guess[key]
                if pending[0] != "":
                    pending[1] = bayes.classify(pending[0], default="unknown")

            # printing predicted results to text widget
            self.printOut()
コード例 #11
0
    def fetch(self):
        """Fetch members and projects, train the classifier, list the projects.

        A naive Bayes classifier is stored in ``self.cl`` and trained with
        every publication of every member (the member key is the category).
        The keys of the fetched projects are then inserted into
        ``self.project_listbox`` in sorted order.
        """
        self.fetcher.fetch_members()
        self.members = self.fetcher.fetch_member_publications()
        self.cl = docclass.naivebayes(docclass.getwords)

        for mem, obj in self.members.items():
            for pub in obj.publication:
                self.cl.train(pub, mem)
        self.projects = self.fetcher.fetch_projects()

        # sorted() works on Python 2 lists and Python 3 key views alike;
        # calling .sort() on the result of .keys() only works on Python 2.
        for title in sorted(self.projects.keys()):
            self.project_listbox.insert(END, title)
コード例 #12
0
        def predict(event):
            """Classify the rows in ``self.guess`` and refresh the output.

            A new naive Bayes classifier is trained on every ``self.trainer``
            row whose document (index 0) and category (index 1) are both
            non-empty. Each ``self.guess`` row with a non-empty document then
            receives its predicted category at index 1, and
            ``self.printOut()`` is called.
            """
            model = docclass.naivebayes(docclass.getwords)

            # training classifier
            for row_id in self.trainer:
                row = self.trainer[row_id]
                if not (row[0] != "" and row[1] != ""):
                    continue
                model.train(row[0], row[1])

            # predicting grades for other courses
            for row_id in self.guess:
                row = self.guess[row_id]
                if not (row[0] != ""):
                    continue
                predicted = model.classify(row[0], default="unknown")
                row[1] = predicted

            # printing predicted results to text widget
            self.printOut()
コード例 #13
0
    def Bayes_prediction(self):
        """Predict the department of the professor selected in ``self.combovar``.

        A naive Bayes classifier is trained with the information of every
        other professor, using the professor's department as the category.
        The thresholds in ``self.list_of_thresholds`` are applied, the
        selected professor's information is classified (default ``None``)
        and the result is passed to ``self.help_to_write``.
        """
        bayes = docclass.naivebayes(docclass.getwords)
        for department in self.list_of_department:  # category = department
            staff = self.dictionary_of_department_and_professor[department]
            for professor in staff:
                # The selected professor is left out of the training data.
                if professor != self.combovar.get():
                    # dictionary_as_database[professor] = professor's information
                    bayes.train(self.dictionary_as_database[professor],
                                department)

        for department, score in self.list_of_thresholds:
            bayes.setthreshold(department, score)

        selected_info = self.dictionary_as_database[self.combovar.get()]
        self.help_to_write(bayes.classify(selected_info, default=None))
コード例 #14
0
ファイル: run.py プロジェクト: wz125/courses
def bayesianClassification():
  """Train a naive Bayes classifier on two article rows and classify one.

  Data comes from ``readpickle()``. Features of a word-matrix row are the
  words whose count in that row is positive. Results are printed.
  """
  print('## Bayesian Classification')
  _allwords, _articlewords, titles, matrix, vocabulary = readpickle()

  def wordmatrixfeatures(x):
    # Words at the positions where the row holds a positive count.
    return [vocabulary[pos] for pos, count in enumerate(x) if count > 0]

  # The result of this call is discarded, as in the original.
  wordmatrixfeatures(matrix[0])
  #sys.path.append("../chapter6-filtering")
  import docclass
  classifier = docclass.naivebayes(wordmatrixfeatures)
  classifier.setdb('newstest.db')

  print(titles[0].encode("utf-8"))
  # Train this as an 'iraq' story
  classifier.train(matrix[0], 'iraq')

  print(titles[1].encode("utf-8"))
  # Train this as an 'india' story
  classifier.train(matrix[1], 'india')

  print(titles[2].encode("utf-8"))
  # How is this story classified?
  # NOTE(review): title 2 is printed but row 1 is classified; confirm
  # whether row 2 was intended.
  print(classifier.classify(matrix[1]))
コード例 #15
0
    def crawl(self, pages, depth=2, maxpages=1000):
        """Crawl breadth-first from ``pages``, indexing every page fetched.

        Args:
            pages: iterable of start URLs.
            depth: number of link levels to follow.
            maxpages: the crawl returns once more than this many pages have
                been attempted.

        Each page is decoded with the charset detected by ``chardet`` and
        re-encoded as UTF-8, indexed through ``self.addtoindex`` together
        with a sample-trained naive Bayes classifier, and its links are
        recorded with ``self.addlinkref``. The database is committed after
        every page.
        """
        import docclass
        classifier = docclass.naivebayes(docclass.getwords)
        docclass.sampletrain(classifier)
        fetched = 0  # pages attempted so far (does not shadow builtin iter)
        for _ in range(depth):
            newpages = set()
            for page in pages:
                print(fetched)
                fetched += 1
                if fetched > maxpages:
                    return
                try:
                    c = urllib2.urlopen(page)
                except Exception:
                    # Still best-effort for any fetch failure, but no longer
                    # swallows KeyboardInterrupt / SystemExit.
                    print("Could not open %s" % page)
                    continue
                try:
                    content = c.read()
                finally:
                    c.close()  # release the connection even if read() fails
                # chardet reports encoding None when it cannot decide; fall
                # back to UTF-8 instead of passing None to decode().
                charset = chardet.detect(content[:400])['encoding'] or 'utf-8'
                content = content.decode(charset, "ignore").encode('UTF-8')

                soup = BeautifulSoup(content, 'html.parser')
                self.addtoindex(page, soup, classifier)

                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    if "'" in url:
                        continue
                    url = url.split('#')[0]  # drop the fragment part
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
                self.dbcommit()
            pages = newpages
コード例 #16
0
def train_naivebayes(train_path, test_path):
    """Train a naive Bayes spam filter and score it on a test set.

    ``train_path`` and ``test_path`` are glob patterns. The label of each
    file is the fourth dot-separated field of its path. After training, a
    threshold of 2.4 is set for 'ham', every test file is classified
    (default 'Unknown') and the confusion counts are passed to ``score`` in
    the order TP, FN, FP, TN. Any other (prediction, label) pair is printed.
    """
    cl = docclass.naivebayes(docclass.getwords)

    for filename in glob.glob(train_path):
        with open(filename, 'r') as handle:
            text = handle.read()
            label = filename.split('.')[3]
            cl.train(text, label)

    print("Train Done!")

    # Confusion counts keyed by (true label, predicted label); floats as in
    # the original counters.
    tally = {
        ('spam', 'spam'): 0.0,  # TP
        ('spam', 'ham'): 0.0,   # FN
        ('ham', 'spam'): 0.0,   # FP
        ('ham', 'ham'): 0.0,    # TN
    }

    cl.setthreshold('ham', 2.4)
    for filename in glob.glob(test_path):
        with open(filename, 'r') as handle:
            text = handle.read()
            label = filename.split('.')[3]
            predict = cl.classify(text, default='Unknown')
            outcome = (label, predict)
            if outcome in tally:
                tally[outcome] += 1
            else:
                print("%s %s" % (predict, label))

    print("Test Done!")

    score(tally[('spam', 'spam')], tally[('spam', 'ham')],
          tally[('ham', 'spam')], tally[('ham', 'ham')])
0
 def testProb(self):
   """Check the naive Bayes probabilities after sample training.

   Uses ``assertAlmostEqual``; the ``assertAlmostEquals`` spelling is a
   deprecated alias that newer Python versions no longer provide.
   """
   cl = docclass.naivebayes(docclass.getwords)
   cl.setdb('test.db')
   docclass.sampletrain(cl)
   # NOTE(review): the category is passed first and the document second
   # here; confirm this matches the parameter order of prob() in the
   # docclass module under test.
   self.assertAlmostEqual(0.15624999, cl.prob('good', 'quick rabbit'))
   self.assertAlmostEqual(0.05, cl.prob('bad', 'quick rabbit'))
コード例 #18
0
cl = docclass.classifier(docclass.get_words)

docclass.sampletrain(cl)

print cl.cat_count('good')
print cl.feat_cat_count('money','bad')
print cl.feat_cat_count('money','good')
print cl.fprob('money','good')
print cl.weightedprob('money','good',cl.fprob)

docclass.sampletrain(cl)

print cl.weightedprob('money','good',cl.fprob)
"""

# Naive Bayes walkthrough: train on the sample data, then print counts,
# feature probabilities, document probabilities and one classification.
cl = docclass.naivebayes(docclass.get_words)

docclass.sampletrain(cl)

# Category and feature counts after one round of training.
print(cl.cat_count('good'))
print(cl.feat_cat_count('money','bad'))
print(cl.feat_cat_count('money','good'))
print(cl.fprob('money','good'))
print('test')
# The weighted probability is printed twice with no training in between.
print(cl.weightedprob('money','good',cl.fprob))
print(cl.weightedprob('money','good',cl.fprob))
# Same document in lower and upper case.
print(cl.prob('quick rabbit','good'))
print(cl.prob('QUICK RABBIT','good'))

print(cl.classify('quick rabbit',default = 'unkown'))
コード例 #19
0
        print "original: " + str(emotion)
        for emotion_group in emotion_groups.keys():
            if emotion_groups[emotion_group].has_key(emotion):
                grouped_emotion = emotion_group
        print "updated: " + grouped_emotion
        print "get rid of removed!!!"
        if grouped_emotion != 'remove':
            emotion_list.append([text, emotion, grouped_emotion])
    return emotion_list
        


if __name__ == "__main__":
    # Train the naive Bayes classifier on the processed feature set, using
    # each item's grouped emotion as the category.
    #run3()
    fs = fetchBasicFeatureSet()
    # Bound to a new name: assigning the result back to ``processed_fs``
    # replaced the function with its own return value.
    processed_items = processed_fs(fs)
    classifier = nbc.naivebayes(nbc.getwords)

    classifier.setdb()

    for item in processed_items:
        # item[0] is the text, item[2] the grouped emotion; the raw emotion
        # at item[1] is not used for training.
        classifier.train(item[0], item[2])

    #print classifier.classify("L'amour s'en va comme cette eau courante L'amour s'en va")
   

コード例 #20
0
import docclass as dc

# Naive Bayes classifier that uses dc.getwords to extract features.
classifier = dc.naivebayes(dc.getwords)
# The calls below are disabled; the original author noted that setdb() works.
#classifier.setdb() #works!
#classifier.train("hello world","test_category")
コード例 #21
0
import docclass as dc
import os

# Directories holding the training and testing messages; each ends with a
# slash because file names are appended directly.
training_path_spam = "training/spam/"
training_path_not_spam = "training/not_spam/"
testing_path_spam = "testing/spam/"
testing_path_not_spam = "testing/not_spam/"

train_set = dc.naivebayes(dc.getwords)
try:
    os.remove('mack.db')  # deletes old database
except OSError:
    # Best effort: there may be no previous database to delete. Only
    # file-system errors are ignored, so interrupts still propagate.
    pass
train_set.setdb('mack.db')  # creates fresh database


def train(filepath, desig):  #desig is 'spam' or 'not spam'
    """Train ``train_set`` with every non-blank line of every file in a folder.

    Args:
        filepath: directory prefix; file names are appended to it directly,
            so it is expected to end with a path separator.
        desig: category to train each line under ('spam' or 'not spam').
    """
    for each in os.listdir(filepath):
        # The with-block closes each file as soon as it has been read; the
        # original left every handle open.
        with open(filepath + each, 'r') as handle:
            for line in handle:
                if line != '\n':
                    train_set.train(line, desig)


def test(filepath):
    file_list = os.listdir(filepath)
    for each in file_list:
        test_string = ""
        for line in open(filepath + each, 'r'):
            if line != '\n':
コード例 #22
0
 def setUpClass(cls):
     """Create the shared naive Bayes client and point it at the test database.

     The client is stored in ``cls.client`` and the value returned by its
     ``setdb`` call in ``cls.db``.
     """
     client = docclass.naivebayes(docclass.getwords)
     cls.client = client
     cls.db = client.setdb(TEST_DATABASE)
 def train_classifier(self):
     """Create ``self.classifier`` and train it on each member's publications.

     A naive Bayes classifier is stored in ``self.classifier`` first, then
     trained with every publication of every entry in
     ``self.faculty_members``, using the member key as the category.
     """
     bayes = docclass.naivebayes(docclass.getwords)
     self.classifier = bayes
     members = self.faculty_members
     for member in members:
         for paper in members[member].publications:
             bayes.train(paper, member)
コード例 #24
0
ファイル: test.py プロジェクト: sevenry/python_learning_note


def kk():
    for i in range(4):
        print(i)
        if i+5>6: return i
    return i+1
    
    
a=kk()
print(a)'''


## End-of-chapter exercise 2
# c2 is a plain naive Bayes classifier; it is created here but only used by
# the commented-out lines below.
c2=docclass.naivebayes(docclass.getwords)

#docclass.sampletrain(c2)

# c1 is built from docclass.noignore with the same feature extractor.
c1=docclass.noignore(docclass.getwords)
#docclass.sampletrain(c2)
print('hhh')
#print(c2.prob('quick','good'))
#print(c1.prob('quick rabbit','bad'))
# Train c1 on the sample data and print the probability of 'quick' for 'good'.
docclass.sampletrain(c1)
print(c1.prob('quick','good'))
#print(c1.prob('quick rabbit','bad'))

'''
#print(c1.classify('owns water',default='unknown'))
#print(c1.classify('quick money',default='unknown'))
コード例 #25
0
 def __init__(self,classifier_dict,feed_queue):
     """Open the database connection, build the classifier, init the thread.

     Args:
         classifier_dict: passed to ``nbc.naivebayes`` as its second argument.
         feed_queue: stored on the instance as ``self.feed_queue``.
     """
     dsn = "dbname='nlptweets' host='localhost' user='******' password='******'"
     connection = psycopg2.connect(dsn)
     self.con = connection
     self.cur = connection.cursor()
     self.classifier = nbc.naivebayes(nbc.getwords, classifier_dict)
     self.feed_queue = feed_queue
     # Thread initialisation runs last, as in the original.
     threading.Thread.__init__(self)
コード例 #26
0
 def naive_bayes(self):
     """Train ``self.my_train`` on the courses that have a text description.

     Only entries of ``self.courses_grades`` with exactly three fields whose
     third field is a ``unicode`` string are used: the description (index 2)
     is the document and the grade (index 0) is the category. A threshold of
     1.0 is set for every grade that is trained.
     """
     self.my_train = docclass.naivebayes(docclass.getwords)
     for key in self.courses_grades:
         record = self.courses_grades[key]
         # isinstance also accepts unicode subclasses, which an exact
         # type() comparison rejected.
         if len(record) == 3 and isinstance(record[2], unicode):
             grade, description = record[0], record[2]
             # course description as doc and course grade as class
             self.my_train.train(description, grade)
             # as mentioned in the assignment, the threshold is set to 1
             self.my_train.setthreshold(grade, 1.0)
コード例 #27
0
ファイル: test.py プロジェクト: Hrishi29/anwala.github.io
import docclass
from subprocess import check_output


cl = docclass.naivebayes(docclass.getwords)
# NOTE(review): the original comment here said "remove previous db file",
# but no removal is performed; the existing database is reused.

cl.setdb('hrishi.db')
docclass.eTrain(cl)

# Classify test files 11..20 of each kind: all spam files first, then all
# non-spam files, printing the file name and the predicted category.
for prefix in ('test/spam', 'test/nonspam'):
    for i in range(11, 21):
        filename = prefix + str(i) + '.txt'

        with open(filename, 'r', encoding='utf-8') as testFile:
            print(filename, cl.classify(testFile.read()))

#classify text: "the banking dinner" as spam or not spam
#print( cl.classify('the banking dinner') )
コード例 #28
0
 def setUpClass(cls):
     """Build one naive Bayes client for the test class and select its database.

     ``cls.client`` holds the classifier; ``cls.db`` holds whatever
     ``setdb(TEST_DATABASE)`` returns.
     """
     shared = docclass.naivebayes(docclass.getwords)
     cls.client = shared
     cls.db = shared.setdb(TEST_DATABASE)