예제 #1
0
def test(doc, name):
	"""Classify every line of the file doc[2] and write the predictions to `name`.

	Per-word, per-class weights are built from doc[:2] by chaining
	FeatureExtractor.frequency -> augmented_frequency -> idf.  Each input line
	is tab-separated: column 0 is the line ID and column 4 is the text, which
	is stripped of "<s>" / "</s>" markers and split on single spaces.  For each
	line, the class with the highest (sum of log word weights + log prior) is
	written as "ID<TAB>label".

	Arguments:
	doc  -- sequence; doc[:2] is handed to FeatureExtractor, doc[2] is the
	        path of the file to classify
	name -- path of the output file (overwritten)

	Raises IndexError if a non-blank line has fewer than five columns.
	"""
	# Build the model before touching the output file, so a failure here
	# does not leave a truncated result file behind.
	frequencies0 = FeatureExtractor.frequency(doc[:2],True,True) #frequency count smoothed by 1
	frequencies1 = FeatureExtractor.augmented_frequency(frequencies0) # augmented frequencies taking into account document size
	frequencies = FeatureExtractor.idf(frequencies1) # idfs

	labels = ["C2","IKEA_EN","IKEA_IT"]
	priors = [0.33, 0.33, 0.329] # based on number of documents
	log_priors = [math.log(p) for p in priors]
	# Words missing from the model get the same fixed weight for every class.
	unknown = math.log(0.5)

	# Both files are managed by `with`, so they are closed even on error.
	with open(name,"w") as out, open(doc[2],"r") as mefile:
		for line in mefile:
			# A blank line (typically the last one) has no columns to read.
			if not line.rstrip("\r\n"):
				continue

			fields = line.split('\t')
			ID = fields[0]
			words = fields[4].replace("<s>","").replace("</s>","").split(" ")

			# Index i of `scores` corresponds to labels[i].
			scores = [0, 0, 0]
			for word in words:
				if word in frequencies:
					weights = frequencies[word]
					for i in range(3):
						scores[i] += math.log(weights[i])
				else:
					for i in range(3):
						scores[i] += unknown

			# The prior is added last, after all word weights are summed.
			b = [score + log_prior for score, log_prior in zip(scores, log_priors)]

			# list.index returns the first maximum, so ties go to the
			# earliest label in `labels`.
			proposal = labels[b.index(max(b))]
			out.write(ID + "\t" + proposal + "\n")
예제 #2
0
def validate(doc, name):
	"""Classify the labelled file doc[2], write predictions and summary metrics to `name`.

	Per-word, per-class weights are built from doc[:2] by chaining
	FeatureExtractor.frequency -> augmented_frequency -> idf.  Each input line
	is tab-separated: column 1 is the expected label and column 4 is the text.
	For every line "predicted<TAB>expected" is written to `name` and the
	prediction is also printed.  After the last line, the number of correct
	predictions and the macro-averaged precision, recall and F1 over the three
	classes are appended to the same file.

	Arguments:
	doc  -- sequence; doc[:2] is handed to FeatureExtractor, doc[2] is the
	        path of the labelled file to classify
	name -- path of the output file (overwritten)

	Raises IndexError if a non-blank line has fewer than five columns.
	"""
	# Build the model before touching the output file, so a failure here
	# does not leave a truncated result file behind.
	# NOTE(review): called without the extra smoothing arguments used by
	# test(); whether the defaults smooth the counts is not visible here.
	frequencies0 = FeatureExtractor.frequency(doc[:2])
	frequencies1 = FeatureExtractor.augmented_frequency(frequencies0) # augmented frequencies taking into account document size
	frequencies = FeatureExtractor.idf(frequencies1) # idfs

	labels = ["C2","IKEA_EN","IKEA_IT"]
	priors = [0.33, 0.33, 0.328] # based on number of documents
	log_priors = [math.log(p) for p in priors]
	# Words missing from the model get the same fixed weight for every class.
	unknown = math.log(0.5)

	correct = 0
	number = 0
	# Per-class true positives, false positives and false negatives.
	tp = dict.fromkeys(labels, 0)
	fp = dict.fromkeys(labels, 0)
	fn = dict.fromkeys(labels, 0)

	# Both files are managed by `with`, so they are closed even on error.
	with open(name,"w") as f, open(doc[2],"r") as mefile:
		for line in mefile:
			# A blank line (typically the last one) has no columns to read.
			if not line.rstrip("\r\n"):
				continue

			fields = line.split('\t')
			expected = fields[1]
			words = fields[4].replace("<s>","").replace("</s>","").split(" ")

			# Index i of `scores` corresponds to labels[i].
			scores = [0, 0, 0]
			for word in words:
				if word in frequencies:
					weights = frequencies[word]
					for i in range(3):
						scores[i] += math.log(weights[i])
				else:
					for i in range(3):
						scores[i] += unknown

			# The prior is added last, after all word weights are summed.
			b = [score + log_prior for score, log_prior in zip(scores, log_priors)]

			# list.index returns the first maximum, so ties go to the
			# earliest label in `labels`.
			proposal = labels[b.index(max(b))]
			f.write(proposal + "\t" + expected + "\n")

			# Single-argument print(...) behaves the same on Python 2 and 3.
			print(proposal)

			if expected == proposal:
				tp[expected] += 1
				correct += 1
			else:
				# An expected label outside the three known classes is not
				# counted as a false negative for any class.
				if expected in fn:
					fn[expected] += 1
				fp[proposal] += 1

			number += 1

		print(fn["C2"])

		# _safe_ratio divides as floats (plain int / int floors on Python 2)
		# and returns 0.0 when a class has no predictions or no examples.
		precisions = [_safe_ratio(tp[label], tp[label] + fp[label]) for label in labels]
		recalls = [_safe_ratio(tp[label], tp[label] + fn[label]) for label in labels]

		avgpre = sum(precisions)/3
		avgrec = sum(recalls)/3
		f1 = 2 * _safe_ratio(avgpre * avgrec, avgpre + avgrec)

		# NOTE(review): "out of" has no surrounding spaces; the text is kept
		# unchanged in case something parses this summary.
		f.write("\n\ncorrect: " + str(correct) + "out of" + str(number))
		f.write("\nprecision: " + str(avgpre))
		f.write("\nrecall: " + str(avgrec))
		f.write("\nF1: " + str(f1))


def _safe_ratio(numerator, denominator):
	"""Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
	if not denominator:
		return 0.0
	return numerator / float(denominator)