def main():
    """Train one class per category folder, then classify every test file.

    For each category, all ``.txt`` files under ``<folder>/train/`` are
    loaded, concatenated into a single text and registered in a
    ``ClassBank`` together with the number of files read.  After the bank
    has been trained, every ``.txt`` file under ``<folder>/test/`` is
    tokenized and classified, and the file name and the name of the
    predicted class are printed.
    """
    folders = {
        "politik": "data/politik",
        "sport": "data/sport",
        "wirtschaft": "data/wirtschaft",
    }

    bank = ClassBank()
    loader = Loader()

    # train data
    for classname, folder in folders.items():
        train_dir = folder + "/train/"
        texts = [
            loader.load_txt(train_dir + filename)
            for filename in os.listdir(train_dir)
            if filename.endswith(".txt")
        ]
        # Every document is prefixed with one space (so the result also
        # starts with a space); a single join avoids re-copying the growing
        # string on each iteration.
        content = "".join(" " + text for text in texts)
        bank.addClass(Class(classname, content, len(texts)))

    bank.train()

    classifier = Classifier()

    # test data
    for classname, folder in folders.items():
        print("\n=== Testing %s ===\n" % classname)
        test_dir = folder + "/test/"
        for filename in os.listdir(test_dir):
            if filename.endswith(".txt"):
                tokenizer = Tokenizer(loader.load_txt(test_dir + filename))
                classified = classifier.classify(tokenizer.getTokens(), bank)
                print("%s = %s" % (filename, classified.getName()))
def main():
    """Train a bank on two hard-coded classes and print their ``condProb`` entries.

    Builds the classes "Politik" and "Wirtschaft" from short literal
    texts, trains a ``ClassBank`` on them, and then prints one
    ``<key> = <value>`` line for every entry in each class's ``condProb``.
    """
    politik = Class("Politik", "Das ist der Inhalt der Klasse", 10)
    wirtschaft = Class("Wirtschaft", "Noch eine Geschichte mit Inhalt", 10)

    bank = ClassBank()
    bank.addClass(politik)
    bank.addClass(wirtschaft)
    bank.train()

    classes = bank.getClasses()
    for key in classes:
        # Look the table up once per class instead of once per entry.
        cond_prob = classes[key].condProb
        for term in cond_prob:
            print("%s = %s" % (term, cond_prob[term]))