def testTrain(self):
    """Check that training on a single item updates category and feature counts."""
    c = Classifier(getwords)
    item = "Hello hello world, my name is Python."
    cat = "Good"
    c.train(item, cat)

    # Exactly one item has been trained into the "Good" category.
    self.assertEqual(c.catcount("Good"), 1)
    # "Hello"/"hello" occurs twice in the item, but the expected count is 1.
    self.assertEqual(c.fcount("hello", "Good"), 1)
    # "my" must not be recorded as a feature (presumably getwords drops very
    # short words -- confirm against getwords). The membership test replaces
    # dict.has_key(), which no longer exists in Python 3.
    self.assertNotIn("my", c.fc)
# Reset the sample counters handled in this excerpt.
# NOTE(review): countnews itself (and its 'tech' entry) is presumably
# initialised just above this excerpt -- confirm.
countnews['nontech'] = 0
countnews['test'] = 0

# The classifier that is trained from the feeds below.
c = Classifier(getwords, initprob=0.5)

# --- Category "Tech": every entry of every feed is one training sample.
print("--------------------News from trainTech------------------------")
for feed in trainTech:
    f = feedparser.parse(feed)
    for e in f.entries:
        print('\n---------------------------')
        # Title and description together, passed through stripHTML.
        fulltext = stripHTML(e.title + ' ' + e.description)
        print(fulltext)
        countnews['tech'] += 1
        c.train(fulltext, "Tech")

# Visual separator between the two training runs.
print("----------------------------------------------------------------")
print("----------------------------------------------------------------")
print("----------------------------------------------------------------")

# --- Category "NonTech": same procedure with the other feed list.
print("--------------------News from trainNonTech------------------------")
for feed in trainNonTech:
    f = feedparser.parse(feed)
    for e in f.entries:
        print('\n---------------------------')
        fulltext = stripHTML(e.title + ' ' + e.description)
        print(fulltext)
        countnews['nontech'] += 1
        c.train(fulltext, "NonTech")
# NOTE(review): this increment is presumably the tail of a test-feed loop
# whose header lies outside this excerpt -- confirm its nesting level.
countnews['test'] += 1

print("----------------------------------------------------------------")
print("----------------------------------------------------------------")
print("----------------------------------------------------------------")

# Summary of how many samples were used per category.
# "%s %s" yields the same output as the former two-argument print statement.
print("%s %s" % ('Number of used trainings samples in categorie tech', countnews['tech']))
print("%s %s" % ('Number of used trainings samples in categorie notech', countnews['nontech']))
print("%s %s" % ('Number of used test samples', countnews['test']))
print('--' * 30)

# Train a second classifier on the prepared "good"/"bad" training data.
rss_classifier = Classifier()
for tech in train_data["good"]:
    rss_classifier.train(tech, "good")
for nontech in train_data["bad"]:
    rss_classifier.train(nontech, "bad")
print("---- training finished ---------------------")

# Score every test item against both categories and print normalized shares.
for test in data:
    g_pb = rss_classifier.prob(test, "good")
    b_pb = rss_classifier.prob(test, "bad")

    # Normalize the probabilities so that both shares add up to 1.
    pb_total = g_pb + b_pb
    if pb_total:
        g_pb_n = g_pb / pb_total
        b_pb_n = b_pb / pb_total
    else:
        # Both probabilities are zero: there is nothing to normalize, so
        # report an even split instead of raising ZeroDivisionError.
        g_pb_n = b_pb_n = 0.5

    print(test)
    print("%s %s %s %s" % ("good: ", g_pb_n, " bad: ", b_pb_n))
def testClassifier(self):
    """Train on a small Good/Bad corpus and classify a mixed sentence."""
    # (text, category) pairs, fed to the classifier in exactly this order.
    samples = (
        ("nobody owns the water", "Good"),
        ("the quick rabbit jumps fences", "Good"),
        ("buy pharmaceuticals now", "Bad"),
        ("make quick money at the online casino", "Bad"),
        ("the quick brown fox jumps", "Good"),
        ("next meeting is at night", "Good"),
        ("meeting with your superstar", "Bad"),
        ("money like water", "Bad"),
    )

    classifier = Classifier(getwords)
    for text, category in samples:
        classifier.train(text, category)

    # "quick" was added to the test sentence because plain 'money jumps'
    # scored the same for Good and Bad.
    verdict = classifier.classify("the money jumps quick")
    self.assertEqual(verdict, "Good")
# Reset the test-sample counter.
# NOTE(review): countnews and its 'tech'/'politics' entries are presumably
# initialised just above this excerpt -- confirm.
countnews['test'] = 0

# The classifier that is trained from the feeds below.
c = Classifier(getwords, initprob=0.5)

# --- Category "Tech": every entry of every feed is one training sample.
print("--------------------News from trainTech------------------------")
for feed in trainTech:
    f = feedparser.parse(feed)
    for e in f.entries:
        print('\n---------------------------')
        # Title and description together, passed through stripHTML.
        fulltext = stripHTML(e.title + ' ' + e.description)
        print(fulltext)
        countnews['tech'] += 1
        c.train(fulltext, "Tech")

# Visual separator between the two training runs.
print("----------------------------------------------------------------")
print("----------------------------------------------------------------")
print("----------------------------------------------------------------")

# --- Category "Politics": same procedure with the politics feed list.
print("--------------------News from trainPolitics------------------------")
for feed in trainPolitics:
    f = feedparser.parse(feed)
    for e in f.entries:
        print('\n---------------------------')
        fulltext = stripHTML(e.title + ' ' + e.description)
        print(fulltext)
        countnews['politics'] += 1
        c.train(fulltext, "Politics")