Esempio n. 1
0
    def test(self, printTags=False, maxLines=0):
        """Score the proposed tags against the tags stored in ``self.reader``.

        The first three rows of ``self.reader`` are skipped. For every
        remaining row, features are built from the title (column 6) and the
        body (column 7), ``self.getMax`` proposes tags for those features,
        and each non-empty tag in columns 8-12 is counted as accepted when it
        is among the proposed tags and as an error otherwise. The running
        totals are printed after each row and the final totals at the end.

        Args:
            printTags: when true, also print the proposed tags and the stored
                tags for every row.
            maxLines: stop once this many rows have been tested; a value of
                0 or less disables the limit.
        """
        lines = 0
        accepted = 0
        errors = 0
        # Skip the three leading rows. The default passed to next() keeps an
        # input with fewer rows from raising StopIteration here.
        for _ in range(3):
            next(self.reader, None)
        for row in self.reader:
            features = util.featureIt(util.tokenizeIt(row[6]))
            features.update(util.featureIt(util.tokenizeIt(row[7])))

            maxProbTags = self.getMax(features)
            inputTags = [row[i] for i in range(8, 13)]
            if printTags:
                print("\nTEST %d" % lines)
                print("Tags found (%d): " % len(maxProbTags))
                print(maxProbTags)
                print("Tags marked:")
                print(inputTags)
                print('\n')

            for tag in inputTags:
                # Empty columns mean "no tag" and are not scored.
                if tag == "":
                    continue
                if tag in maxProbTags:
                    accepted += 1
                else:
                    errors += 1
            # Format inside the call so the output does not depend on the
            # print statement binding looser than the % operator.
            print("Testing line %d. Acc: %d, err: %d"
                  % (lines, accepted, errors))
            lines += 1
            # Stop as soon as maxLines rows were tested; comparing with ">"
            # after the increment would let one extra row through.
            if 0 < maxLines <= lines:
                break
        print("Accepted: %d" % accepted)
        print("Errors: %d" % errors)
Esempio n. 2
0
	def test(self,printTags=False,maxLines=0):
		"""Score the proposed tags against the tags stored in ``self.reader``.

		The first three rows of ``self.reader`` are skipped. For every
		remaining row, features are built from the title (column 6) and the
		body (column 7), ``self.getMax`` proposes tags for those features,
		and each non-empty tag in columns 8-12 is counted as accepted when it
		is among the proposed tags and as an error otherwise. The running
		totals are printed after each row and the final totals at the end.

		Args:
			printTags: when true, also print the proposed tags and the stored
				tags for every row.
			maxLines: stop once this many rows have been tested; a value of
				0 or less disables the limit.
		"""
		lines = 0
		accepted = 0
		errors = 0
		# Skip the three leading rows. The default passed to next() keeps an
		# input with fewer rows from raising StopIteration here.
		for _ in range(3):
			next(self.reader, None)
		for row in self.reader:
			features = util.featureIt(util.tokenizeIt(row[6]))
			features.update(util.featureIt(util.tokenizeIt(row[7])))

			maxProbTags = self.getMax(features)
			inputTags = [row[i] for i in range(8,13)]
			if printTags:
				print("\nTEST %d" % lines)
				print("Tags found (%d): " % len(maxProbTags))
				print(maxProbTags)
				print("Tags marked:")
				print(inputTags)
				print('\n')

			for tag in inputTags:
				# Empty columns mean "no tag" and are not scored.
				if tag == "":
					continue
				if tag in maxProbTags:
					accepted += 1
				else:
					errors += 1
			# Format inside the call so the output does not depend on the
			# print statement binding looser than the % operator.
			print("Testing line %d. Acc: %d, err: %d" % (lines,accepted,errors))
			lines += 1
			# Stop as soon as maxLines rows were tested; comparing with ">"
			# after the increment would let one extra row through.
			if 0 < maxLines <= lines:
				break
		print("Accepted: %d" % accepted)
		print("Errors: %d" % errors)
Esempio n. 3
0
	def newFeatureset(self,filename):
		"""Build a featureset from the CSV file ``filename`` and save it.

		The first row of the file is skipped. For every following row, the
		title (column 6) and the body (column 7) are each tokenized, turned
		into features and paired with the row's non-empty tags (columns 8-12)
		through ``util.featuresetIt``; the results are appended to one list.
		When ``MAX_LOADED_LINES`` is positive, at most that many rows are
		loaded. The list is handed to ``self.saveFeaturesetFile`` together
		with ``filename + ".featureset"``.

		Args:
			filename: path of the CSV file to read.

		Returns:
			The list of featureset entries that was built.
		"""
		featureset = []
		lines = 0
		# The with-block closes the file even if parsing fails part-way.
		# self.csvReader is still assigned as before, but it wraps a closed
		# file once this method returns.
		with open(filename,"rb") as f:
			self.csvReader = csv.reader(f)
			# Skip the first row; the default keeps an empty file from
			# raising StopIteration.
			next(self.csvReader, None)
			for row in self.csvReader:
				# Check the limit before loading the row so that exactly
				# MAX_LOADED_LINES rows end up in the featureset.
				if MAX_LOADED_LINES>0 and lines>=MAX_LOADED_LINES:
					print("Maximum exceeded!")
					break
				title = row[6]
				body = row[7]
				tags = [row[i] for i in range(8,13) if len(row[i])>0]
				featureset += util.featuresetIt( util.featureIt(util.tokenizeIt(title)), tags )
				featureset += util.featuresetIt( util.featureIt(util.tokenizeIt(body)), tags )
				lines += 1
				print("%d lines parsed." % lines)
		self.saveFeaturesetFile(featureset,filename+".featureset")
		return featureset
Esempio n. 4
0
	def getProbI(self,question):
		"""Classify a question and return the result of ``prob_classify``.

		Args:
			question: a ``(title, body)`` pair. Both parts are tokenized with
				``util.tokenizeIt`` and the token sequences are concatenated
				before features are extracted.

		Returns:
			Whatever ``self.naive.prob_classify`` returns for the features
			(presumably a probability distribution over tags -- confirm
			against the classifier in use).
		"""
		title, body = question
		tokens = util.tokenizeIt(title) + util.tokenizeIt(body)
		features = util.featureIt(tokens)
		return self.naive.prob_classify(features)