Ejemplo n.º 1
0
class Class():
	"""One category of the classifier, built from its concatenated training text.

	Attributes:
		name: label of the category.
		contentRaw: the raw text passed to the constructor.
		tokens: the Tokenizer built from contentRaw.
		condProb: per-token mapping; starts as a copy of the token counts and
			is overwritten with conditional probabilities during training.
		count: the count passed to the constructor (presumably the number of
			training documents -- confirm against the caller).
		prior: prior probability of the category, 0.0 until setPrior is called.
	"""

	def __init__(self, name, content, count):
		"""Tokenize content and initialise the per-token table from its counts."""
		self.name = name
		self.contentRaw = content
		self.tokens = Tokenizer(content)
		# Copy the mapping: writing probabilities into condProb must not
		# overwrite the tokenizer's own counts, which the token-sum methods read.
		# Assumes getTokens() returns a dict-like token -> count mapping.
		self.condProb = dict(self.tokens.getTokens())
		self.count = count
		self.prior = 0.0

	def setPrior(self, prior):
		"""Store the prior probability of this category."""
		self.prior = prior

	def condProbs(self):
		"""Return the whole per-token table."""
		return self.condProb

	def getCondProb(self, token):
		"""Return the table entry for token.

		Raises:
			KeyError: if token has no entry.
		"""
		return self.condProb[token]

	def condProb(self, token):
		# Kept only for backward compatibility. The instance attribute of the
		# same name shadows this method on every instance, so it is reachable
		# only as Class.condProb(inst, token). Use getCondProb instead.
		return self.getCondProb(token)

	def getName(self):
		"""Return the label of this category."""
		return self.name

	def getTokens(self):
		"""Return the Tokenizer object (not the token mapping itself)."""
		return self.tokens

	def getTokenSum(self):
		"""Return the number of entries in the tokenizer's token collection.

		NOTE(review): this and getTokenSumIgnoreDuplicates look as if their
		names were swapped; the original behaviour is kept.
		"""
		return len(self.tokens.getTokens())

	def getTokenSumIgnoreDuplicates(self):
		"""Return the sum of the values stored for every token."""
		counts = self.tokens.getTokens()
		return sum(counts[token] for token in counts)
def main():
	"""Train on each category's train folder, then classify its test files.

	For every category, all ``.txt`` files under ``<folder>/train`` are loaded
	and concatenated into one Class, which is added to a ClassBank. After
	training the bank, every ``.txt`` file under ``<folder>/test`` is
	tokenized, classified, and printed as ``<file> = <predicted name>``.

	Relies on ``os``, ClassBank, Loader, Class, Tokenizer and Classifier
	being available in the module namespace.
	"""
	folders = {
		"politik": "data/politik",
		"sport": "data/sport",
		"wirtschaft": "data/wirtschaft",
	}

	bank = ClassBank()
	loader = Loader()

	# train data
	for classname, folder in folders.items():
		train_dir = os.path.join(folder, "train")
		texts = [
			loader.load_txt(os.path.join(train_dir, filename))
			for filename in os.listdir(train_dir)
			if filename.endswith(".txt")
		]
		# Every document is prefixed with one space, exactly as the previous
		# incremental concatenation did; join avoids the quadratic copying.
		content = "".join(" " + text for text in texts)
		bank.addClass(Class(classname, content, len(texts)))

	bank.train()
	classifier = Classifier()

	# test data
	for classname, folder in folders.items():
		# Single-argument print(...) gives identical output on Python 2 and 3.
		print("\n=== Testing %s ===\n" % classname)
		test_dir = os.path.join(folder, "test")
		for filename in os.listdir(test_dir):
			if filename.endswith(".txt"):
				tokenizer = Tokenizer(loader.load_txt(os.path.join(test_dir, filename)))
				classifiedClass = classifier.classify(tokenizer.getTokens(), bank)
				print("%s = %s" % (filename, classifiedClass.getName()))
Ejemplo n.º 3
0
class ClassBank:
    """Collection of categories plus a vocabulary shared by all of them.

    Attributes:
        classes: mapping of category name to the added class instance.
        documentCount: running sum of the ``count`` of every added class.
        tokenizer: Tokenizer fed with the raw content of every added class;
            its tokens act as the shared vocabulary.
    """

    def __init__(self):
        """Create an empty bank with an empty vocabulary tokenizer."""
        self.classes = {}
        self.documentCount = 0
        self.tokenizer = Tokenizer("")

    def addClass(self, classInst):
        """Register classInst under its name and fold it into the totals.

        A class added under an existing name replaces the stored instance,
        but its content and count are still added to the totals.
        """
        self.classes[classInst.getName()] = classInst
        self.tokenizer.tokenize(classInst.contentRaw)
        self.documentCount = self.documentCount + classInst.count

    def getClass(self, name):
        """Return the class registered under name, or False if there is none."""
        return self.classes.get(name, False)

    def getClasses(self):
        """Return the name -> class mapping itself (not a copy)."""
        return self.classes

    def getVocabulary(self):
        """Return the Tokenizer that holds the shared vocabulary."""
        return self.tokenizer

    def getVocabularySum(self):
        """Return the number of entries in the vocabulary."""
        return len(self.tokenizer.getTokens())

    def train(self):
        """Set every class's prior and rewrite its condProb entries in place.

        The prior is ``count / documentCount`` as a float; it is 0.0 when no
        documents have been added, instead of raising ZeroDivisionError.
        """
        vocabulary = self.getVocabulary().getTokens()
        total_docs = self.documentCount
        for cls in self.classes.values():
            # float() keeps the ratio fractional on Python 2 as well.
            prior = float(cls.count) / total_docs if total_docs else 0.0
            cls.setPrior(prior)
            counts = cls.getTokens().getTokens()
            # Only existing keys are reassigned, so iterating condProb is safe
            # even when it is the same object as counts.
            # NOTE(review): the denominator is (distinct tokens in the class +
            # vocabulary entry for the token), which is not the usual add-one
            # smoothing denominator -- confirm this is intended.
            for key in cls.condProb:
                cls.condProb[key] = float(counts[key] + 1) / (len(counts) + vocabulary[key])
Ejemplo n.º 4
0
	def __init__(self, link, pagetitle, outgoing, html):
		"""Initialise a page record from its link, title, HTML and out-links.

		Args:
			link: stored as the page's name.
			pagetitle: stored as the page's title.
			outgoing: iterable whose items are each passed to self.addOut.
			html: stored as the page's content and handed to Tokenizer.
		"""
		self.name = link
		self.title = pagetitle
		self.outLinks = {}
		self.incoming = {}
		self.content = html
		# Every page starts with a rank of 1.
		self.pageRank = 1
		# Only the token collection is kept; the Tokenizer itself is discarded.
		self.tokens = Tokenizer(self.content).getTokens()

		for target in outgoing:
			self.addOut(target)
class ClassBank():
	"""Collection of categories plus a vocabulary shared by all of them.

	Attributes:
		classes: mapping of category name to the added class instance.
		documentCount: running sum of the ``count`` of every added class.
		tokenizer: Tokenizer fed with the raw content of every added class;
			its tokens act as the shared vocabulary.
	"""

	def __init__(self):
		"""Create an empty bank with an empty vocabulary tokenizer."""
		self.classes = {}
		self.documentCount = 0
		self.tokenizer = Tokenizer("")

	def addClass(self, classInst):
		"""Register classInst under its name and fold it into the totals.

		A class added under an existing name replaces the stored instance,
		but its content and count are still added to the totals.
		"""
		self.classes[classInst.getName()] = classInst
		self.tokenizer.tokenize(classInst.contentRaw)
		self.documentCount = self.documentCount + classInst.count

	def getClass(self, name):
		"""Return the class registered under name, or False if there is none."""
		return self.classes.get(name, False)

	def getClasses(self):
		"""Return the name -> class mapping itself (not a copy)."""
		return self.classes

	def getVocabulary(self):
		"""Return the Tokenizer that holds the shared vocabulary."""
		return self.tokenizer

	def getVocabularySum(self):
		"""Return the number of entries in the vocabulary."""
		return len(self.tokenizer.getTokens())

	def train(self):
		"""Set every class's prior and a condProb entry for each vocabulary token.

		Both values are computed with float division, so they no longer
		truncate to 0 on Python 2. The prior is 0.0 when no documents have
		been added, instead of raising ZeroDivisionError.
		"""
		vocabulary = self.getVocabulary().getTokens()
		total_docs = self.documentCount
		vocab_size = len(vocabulary)
		for cls in self.classes.values():
			prior = float(cls.count) / total_docs if total_docs else 0.0
			cls.setPrior(prior)
			counts = cls.getTokens().getTokens()
			# NOTE(review): the denominator adds one per distinct class token
			# and then the vocabulary size again, so the resulting values do
			# not sum to 1 -- confirm this is the intended smoothing.
			smoothed_total = sum(value + 1 for value in counts.values())
			denominator = float(smoothed_total + vocab_size)
			for key in vocabulary:
				# Tokens unseen in this class get the add-one floor.
				cls.condProb[key] = (counts.get(key, 0) + 1) / denominator
Ejemplo n.º 6
0
class Scorer():
	"""Score indexed documents against a query phrase with tf-idf cosine similarity.

	The constructor tokenizes the phrase and immediately computes document
	vector lengths, the query vector length and the ranking.

	Attributes:
		tokens: Tokenizer built from the query phrase.
		index: the index object; this class reads ``index.index`` (token ->
			entry with a ``urlList`` mapping of document -> term frequency),
			``index.bank.urls``, ``index.getIndexToken`` and
			``index.getDocumentFrequency``.
		ranking: document -> similarity score.
		lengths: document -> Euclidean length of its tf-idf vector.
		tlength: Euclidean length of the query's tf-idf vector.
	"""

	def __init__(self, phrase, index):
		"""Tokenize phrase and compute all lengths and scores against index."""
		self.tokens = Tokenizer(phrase)
		self.index = index
		self.ranking = {}
		self.lengths = {}
		self.tlength = 0
		self.calc_document_length()
		self.calc_query_length()
		self.calc_ranking()

	def calc_document_length(self):
		"""Fill self.lengths with the tf-idf vector length of every indexed document."""
		doc_total = len(self.index.bank.urls)
		for token in self.index.index:
			urls = self.index.index[token].urlList
			# The idf depends only on the token, so compute it once per token.
			idf = self.calc_dtf(doc_total, token)
			for doc in urls:
				weight = self.calc_tf(urls[doc]) * idf
				self.lengths[doc] = self.lengths.get(doc, 0) + math.pow(weight, 2)
		for doc in self.lengths:
			self.lengths[doc] = math.sqrt(self.lengths[doc])

	def calc_query_length(self):
		"""Accumulate the query's tf-idf vector length into self.tlength."""
		doc_total = len(self.index.bank.urls)
		for token in self.tokens.getTokens():
			query_tf = self.calc_tf(self.get_query_term_length(token))
			self.tlength += math.pow(query_tf * self.calc_dtf(doc_total, token), 2)
		self.tlength = math.sqrt(self.tlength)

	def calc_ranking(self):
		"""Fill self.ranking with the normalised dot product of query and document.

		Query tokens without a posting entry are skipped. A document whose
		normalisation factor is zero is scored 0.0 instead of raising
		ZeroDivisionError.
		"""
		doc_total = len(self.index.bank.urls)
		for token in self.tokens.getTokens():
			posting = self.index.getIndexToken(token)
			# Assumes a missing token yields a falsy value or an object
			# without urlList -- TODO confirm against the index implementation.
			urls = getattr(posting, "urlList", None)
			if not urls:
				continue

			idf = self.calc_dtf(doc_total, token)
			# The query-side weight does not depend on the document.
			query_weight = self.calc_tf(self.get_query_term_length(token)) * idf

			for doc in urls:
				doc_weight = self.calc_tf(urls[doc]) * idf
				self.ranking[doc] = self.ranking.get(doc, 0) + doc_weight * query_weight

		for doc in self.ranking:
			norm = self.lengths.get(doc, 0) * self.tlength
			self.ranking[doc] = self.ranking[doc] / norm if norm else 0.0

	def calc_tf(self, val):
		"""Return the log-scaled frequency 1 + log10(val), or 0.0 when val <= 0."""
		if val <= 0:
			return 0.0
		return 1 + math.log10(val)

	def calc_dtf(self, n, token):
		"""Return log10(n / document frequency of token).

		Returns 0.0 when the token has no document frequency or n is not
		positive, instead of raising ZeroDivisionError or ValueError.
		"""
		doc_freq = self.index.getDocumentFrequency(token)
		if not doc_freq or n <= 0:
			return 0.0
		return math.log10(float(n) / float(doc_freq))

	def get_query_term_length(self, token):
		"""Return how many query tokens equal token."""
		return sum(1 for t in self.tokens.getTokens() if t == token)

	def printScoring(self):
		"""Print the query tokens, then every document by descending score."""
		tokens = self.tokens.getTokens()
		# join gives "[]" for an empty query; slicing off a trailing ", "
		# used to eat the opening bracket in that case.
		printable = "[" + ", ".join("'%s'" % (t,) for t in tokens) + "]\n"
		ranked = sorted(self.ranking.items(), key=lambda x: x[1], reverse=True)
		printable += "".join("%s:\t%.6f\n" % (doc, score) for doc, score in ranked)
		print (printable)

	def printDocumentLength(self):
		"""Print every document and its vector length, sorted by document key."""
		printable = "".join(
			"%s:\t%.6f\n" % (doc, self.lengths[doc]) for doc in sorted(self.lengths)
		)
		print (printable)