class Class(object):
    """One training class (category) for the Naive Bayes classifier.

    Holds the raw training text, its token counts, the number of
    training documents and the prior assigned during training.
    """

    def __init__(self, name, content, count):
        self.name = name
        self.contentRaw = content
        self.tokens = Tokenizer(content)
        # NOTE(review): this attribute shadows the condProb() method below,
        # making that method unreachable on instances.  Kept as an attribute
        # because ClassBank.train() mutates this dict in place.
        self.condProb = self.tokens.getTokens()
        self.count = count
        self.prior = 0.0

    def setPrior(self, prior):
        """Store the class prior computed by ClassBank.train()."""
        self.prior = prior

    def condProbs(self):
        """Return the token -> conditional-probability mapping."""
        return self.condProb

    def condProb(self, token):
        """Return the conditional probability of a single token.

        NOTE(review): unreachable on instances because __init__ assigns a
        dict to self.condProb; left in place so the class interface does
        not change.
        """
        return self.condProb[token]

    def getName(self):
        return self.name

    def getTokens(self):
        return self.tokens

    def getTokenSum(self):
        """Return the number of DISTINCT tokens (vocabulary size)."""
        return len(self.tokens.getTokens())

    def getTokenSumIgnoreDuplicates(self):
        """Return the total token count (sum of all occurrence counts).

        NOTE(review): despite the name this total INCLUDES duplicates;
        the name is kept so existing callers keep working.
        """
        # sum() over the values avoids re-fetching the token dict on
        # every loop iteration as the original did.
        return sum(self.tokens.getTokens().values())
def main():
    """Train a Naive Bayes classifier on the data/ folders and test it.

    For each class folder, concatenates all *.txt files under train/
    into one training string per class, trains the ClassBank, then
    classifies every *.txt file under test/ and prints the result.
    """
    folders = {
        "politik": "data/politik",
        "sport": "data/sport",
        "wirtschaft": "data/wirtschaft",
    }
    bank = ClassBank()
    loader = Loader()

    # --- training phase: one concatenated document string per class ---
    # .items() instead of .iteritems(): the latter was removed in Python 3.
    for classname, folder in folders.items():
        count = 0
        content = ""
        train_dir = folder + "/train/"
        # 'filename' instead of 'file' to avoid shadowing the builtin.
        for filename in os.listdir(train_dir):
            if filename.endswith(".txt"):
                count += 1
                content += " " + loader.load_txt(train_dir + filename)
        bank.addClass(Class(classname, content, count))
    bank.train()

    classifier = Classifier()

    # --- testing phase: classify each held-out document ---
    for classname, folder in folders.items():
        # print() function instead of the Python-2-only print statement.
        print("\n=== Testing", classname, "===\n")
        test_dir = folder + "/test/"
        for filename in os.listdir(test_dir):
            if filename.endswith(".txt"):
                tokenizer = Tokenizer(loader.load_txt(test_dir + filename))
                classifiedClass = classifier.classify(tokenizer.getTokens(), bank)
                print(filename, "=", classifiedClass.getName())
class ClassBank:
    """Registry of training classes plus the shared vocabulary.

    Collects Class instances, tokenizes all their raw content into one
    shared vocabulary, and computes priors and conditional probabilities.
    """

    def __init__(self):
        self.classes = {}
        self.documentCount = 0
        self.tokenizer = Tokenizer("")

    def addClass(self, classInst):
        """Register a class and merge its raw content into the vocabulary."""
        self.classes[classInst.getName()] = classInst
        self.tokenizer.tokenize(classInst.contentRaw)
        self.documentCount += classInst.count

    def getClass(self, name):
        """Return the class registered under `name`, or False when unknown."""
        if name in self.classes:
            return self.classes[name]
        # BUG FIX: was `return false` -- a NameError at runtime; the
        # Python boolean literal is False.
        return False

    def getClasses(self):
        return self.classes

    def getVocabulary(self):
        return self.tokenizer

    def getVocabularySum(self):
        """Return the number of distinct tokens across all classes."""
        return len(self.tokenizer.getTokens())

    def train(self):
        """Compute class priors and smoothed conditional probabilities."""
        vocab = self.getVocabulary().getTokens()
        n = self.documentCount
        for name in self.classes:
            c = self.classes[name]
            # BUG FIX: float() guards against integer truncation under
            # Python 2, where int / int would make every prior 0.
            c.setPrior(float(c.count) / n)
            t = c.getTokens().getTokens()
            for key in c.condProb:
                # NOTE(review): the denominator len(t) + vocab[key] is
                # unusual; standard add-one smoothing divides by the class's
                # total token count plus the vocabulary size.  Preserved
                # as-is -- confirm the intended formula before changing.
                c.condProb[key] = float(t[key] + 1) / (len(t) + vocab[key])
def __init__(self, link, pagetitle, outgoing, html):
    """Build a page node: identity, tokenized content and link registration.

    Stores the page's link, title and html, tokenizes the html into
    self.tokens, starts with empty in/out link maps and a pageRank of 1,
    then registers every outgoing link through addOut().
    """
    # identity and raw content
    self.name = link
    self.title = pagetitle
    self.content = html
    # fresh, empty link registries
    self.outLinks = {}
    self.incoming = {}
    # every page starts with a PageRank of 1
    self.pageRank = 1
    # tokenize the page content up front
    self.tokens = Tokenizer(self.content).getTokens()
    # register each outgoing link via the class's addOut() helper
    for out_link in outgoing:
        self.addOut(out_link)
class ClassBank(object):
    """Registry of training classes; trains priors and conditional probs.

    NOTE(review): this is a second ClassBank definition -- when both live
    in one module, whichever is defined last shadows the other; consider
    removing one of the two.
    """

    def __init__(self):
        self.classes = {}
        self.documentCount = 0
        self.tokenizer = Tokenizer("")

    def addClass(self, classInst):
        """Register a class and merge its raw content into the vocabulary."""
        self.classes[classInst.getName()] = classInst
        self.tokenizer.tokenize(classInst.contentRaw)
        self.documentCount += classInst.count

    def getClass(self, name):
        """Return the class registered under `name`, or False when unknown."""
        if name in self.classes:
            return self.classes[name]
        # BUG FIX: `false` is not defined in Python; the literal is False.
        return False

    def getClasses(self):
        return self.classes

    def getVocabulary(self):
        return self.tokenizer

    def getVocabularySum(self):
        """Return the number of distinct tokens across all classes."""
        return len(self.tokenizer.getTokens())

    def train(self):
        """Compute class priors and add-one-smoothed conditional probs."""
        vocab = self.getVocabulary().getTokens()
        n = self.documentCount
        for name in self.classes:
            c = self.classes[name]
            # BUG FIX: float() avoids integer truncation under Python 2,
            # where int / int would make every prior 0.
            c.setPrior(float(c.count) / n)
            t = c.getTokens().getTokens()
            # tCount: total tokens of the class plus one per distinct
            # token -- preserved exactly from the original formula.
            tCount = 0
            # .values()/.get(): Python 3 compatible (iteritems was removed).
            for tValue in t.values():
                tCount += tValue + 1
            for key in vocab:
                vCount = t.get(key, 0)
                # BUG FIX: float() keeps this a real probability under
                # Python 2 instead of truncating to 0.
                c.condProb[key] = float(vCount + 1) / (tCount + len(vocab))
class Scorer(object):
    """tf-idf cosine-similarity scorer for a query phrase over an index.

    On construction, computes every document's vector length, the query
    vector length, and the final per-document ranking.
    """

    def __init__(self, phrase, index):
        self.tokens = Tokenizer(phrase)
        self.index = index
        self.ranking = {}
        self.lengths = {}
        self.tlength = 0
        self.calc_document_length()
        self.calc_query_length()
        self.calc_ranking()

    def calc_document_length(self):
        """Compute the Euclidean norm of each document's tf-idf vector."""
        for term in self.index.index:
            urls = self.index.index[term].urlList
            # plain dict iteration instead of iterkeys() -- the latter
            # was removed in Python 3.
            for doc in urls:
                if doc not in self.lengths:
                    self.lengths[doc] = 0
                weight = (self.calc_tf(urls[doc])
                          * self.calc_dtf(len(self.index.bank.urls), term))
                self.lengths[doc] += math.pow(weight, 2)
        for doc in self.lengths:
            self.lengths[doc] = math.sqrt(self.lengths[doc])

    def calc_query_length(self):
        """Compute the Euclidean norm of the query's tf-idf vector."""
        for token in self.tokens.getTokens():
            weight = (self.calc_tf(self.get_query_term_length(token))
                      * self.calc_dtf(len(self.index.bank.urls), token))
            self.tlength += math.pow(weight, 2)
        self.tlength = math.sqrt(self.tlength)

    def calc_ranking(self):
        """Score each document by cosine similarity against the query."""
        for token in self.tokens.getTokens():
            it = self.index.getIndexToken(token)
            dtf = self.calc_dtf(len(self.index.bank.urls), token)
            # was iterkeys(); plain iteration is Python 3 compatible
            for doc in it.urlList:
                # document-side and query-side tf-idf weights; their
                # product is accumulated per document (order preserved
                # from the original).
                doc_weight = self.calc_tf(it.urlList[doc]) * dtf
                query_weight = self.calc_tf(self.get_query_term_length(token)) * dtf
                if doc not in self.ranking:
                    self.ranking[doc] = 0
                self.ranking[doc] += doc_weight * query_weight
        # normalize by the product of document and query vector lengths
        for doc in self.ranking:
            self.ranking[doc] = self.ranking[doc] / (self.lengths[doc] * self.tlength)

    def calc_tf(self, val):
        """Log-scaled term frequency: 1 + log10(val); val must be > 0."""
        return (1 + math.log10(val))

    def calc_dtf(self, n, token):
        """Inverse document frequency: log10(N / df(token))."""
        return math.log10(float(n) / float(self.index.getDocumentFrequency(token)))

    def get_query_term_length(self, token):
        """Count how often `token` occurs in the query phrase."""
        count = 0
        for t in self.tokens.getTokens():
            if t == token:
                count = count + 1
        return count

    def printScoring(self):
        """Print the query tokens and the ranking, highest score first."""
        printable = "["
        for t in self.tokens.getTokens():
            printable += "'%s', " % (t)
        printable = printable[:-2] + "]\n"
        for item in sorted(self.ranking.items(), key=lambda x: x[1], reverse=True):
            printable += "%s:\t%.6f\n" % (item[0], item[1])
        print(printable)

    def printDocumentLength(self):
        """Print each document's vector length, sorted by document key."""
        printable = ""
        for item in sorted(self.lengths):
            printable += "%s:\t%.6f\n" % (item, self.lengths[item])
        print(printable)