def test(self, printTags=False, maxLines=0):
    """Score predicted tags against the tags recorded on each row of ``self.reader``.

    The first three rows of ``self.reader`` are skipped. For every remaining
    row, features are built from the title (column 6) and then updated with
    the features of the body (column 7); ``self.getMax`` is called with them
    and each non-empty tag in columns 8-12 is counted as accepted when it is
    contained in the result, and as an error otherwise. Running totals are
    printed after every row and the final totals after the loop.

    Args:
        printTags: When true, also print the predicted and the recorded
            tags for every row.
        maxLines: Row limit; 0 (the default) disables it. The limit is
            compared with a strict ``>`` after the row counter has been
            incremented, so ``maxLines + 1`` rows are evaluated before the
            loop stops.

    Raises:
        StopIteration: If ``self.reader`` runs out while the three leading
            rows are being skipped.
    """
    lines = 0
    accepted = 0
    errors = 0

    # Skip the three leading rows. The next() builtin is used instead of
    # the Python 2-only .next() method.
    for _ in range(3):
        next(self.reader)

    for row in self.reader:
        title = row[6]
        body = row[7]

        features = util.featureIt(util.tokenizeIt(title))
        features.update(util.featureIt(util.tokenizeIt(body)))
        maxProbTags = self.getMax(features)

        # Columns 8-12, empty strings included so the printed list keeps
        # its five slots; empties are ignored when scoring below.
        inputTags = [row[i] for i in range(8, 13)]

        if printTags:
            print("\nTEST %d" % lines)
            print("Tags found (%d): " % len(maxProbTags))
            print(maxProbTags)
            print("Tags marked:")
            print(inputTags)
            print('\n')

        for tag in inputTags:
            if tag == "":
                continue
            if tag in maxProbTags:
                accepted += 1
            else:
                errors += 1

        # Format first, then print: `print(...) % (...)` only works as a
        # Python 2 print statement and raises TypeError with print().
        print("Testing line %d. Acc: %d, err: %d" % (lines, accepted, errors))

        lines += 1
        if maxLines > 0 and lines > maxLines:
            break

    print("Accepted: %d" % accepted)
    print("Errors: %d" % errors)
def test(self, printTags=False, maxLines=0):
    """Score predicted tags against the tags recorded on each row of ``self.reader``.

    The first three rows of ``self.reader`` are skipped. For every remaining
    row, features are built from the title (column 6) and then updated with
    the features of the body (column 7); ``self.getMax`` is called with them
    and each non-empty tag in columns 8-12 is counted as accepted when it is
    contained in the result, and as an error otherwise. Running totals are
    printed after every row and the final totals after the loop.

    Args:
        printTags: When true, also print the predicted and the recorded
            tags for every row.
        maxLines: Row limit; 0 (the default) disables it. The limit is
            compared with a strict ``>`` after the row counter has been
            incremented, so ``maxLines + 1`` rows are evaluated before the
            loop stops.

    Raises:
        StopIteration: If ``self.reader`` runs out while the three leading
            rows are being skipped.
    """
    lines = 0
    accepted = 0
    errors = 0

    # Skip the three leading rows. The next() builtin is used instead of
    # the Python 2-only .next() method.
    for _ in range(3):
        next(self.reader)

    for row in self.reader:
        title = row[6]
        body = row[7]

        features = util.featureIt(util.tokenizeIt(title))
        features.update(util.featureIt(util.tokenizeIt(body)))
        maxProbTags = self.getMax(features)

        # Columns 8-12, empty strings included so the printed list keeps
        # its five slots; empties are ignored when scoring below.
        inputTags = [row[i] for i in range(8, 13)]

        if printTags:
            print("\nTEST %d" % lines)
            print("Tags found (%d): " % len(maxProbTags))
            print(maxProbTags)
            print("Tags marked:")
            print(inputTags)
            print('\n')

        for tag in inputTags:
            if tag == "":
                continue
            if tag in maxProbTags:
                accepted += 1
            else:
                errors += 1

        # Format first, then print: `print(...) % (...)` only works as a
        # Python 2 print statement and raises TypeError with print().
        print("Testing line %d. Acc: %d, err: %d" % (lines, accepted, errors))

        lines += 1
        if maxLines > 0 and lines > maxLines:
            break

    print("Accepted: %d" % accepted)
    print("Errors: %d" % errors)
def newFeatureset(self, filename):
    """Build a featureset from the CSV file ``filename`` and save it beside that file.

    The first row of the file is skipped. For each remaining row the title
    (column 6) and the body (column 7) are tokenized and turned into
    features separately; each is paired with the row's non-empty tags
    (columns 8-12) through ``util.featuresetIt`` and the results are
    appended to one list. That list is passed to
    ``self.saveFeaturesetFile`` together with ``filename + ".featureset"``
    and then returned.

    When ``MAX_LOADED_LINES`` is greater than zero, reading stops as soon as
    the row counter exceeds it, i.e. after ``MAX_LOADED_LINES + 1`` rows
    have been loaded.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The accumulated featureset list.

    Raises:
        StopIteration: If the file has no first row to skip.
    """
    featureset = []
    lines = 0

    # "rb" is the mode the Python 2 csv module expects. The with-block
    # guarantees the handle is closed, including when a row raises.
    with open(filename, "rb") as f:
        # Still stored on self as before; note the reader cannot be
        # advanced any further once the file has been closed.
        self.csvReader = csv.reader(f)
        next(self.csvReader)  # skip the first row

        for row in self.csvReader:
            title = row[6]
            body = row[7]
            tags = [row[i] for i in range(8, 13) if len(row[i]) > 0]

            featureset += util.featuresetIt(
                util.featureIt(util.tokenizeIt(title)), tags
            )
            featureset += util.featuresetIt(
                util.featureIt(util.tokenizeIt(body)), tags
            )

            lines += 1
            if MAX_LOADED_LINES > 0 and lines > MAX_LOADED_LINES:
                print("Maximum exceeded!")
                break
            # NOTE(review): treated as a per-row progress line; confirm it
            # was not meant to be a single summary after the loop.
            print("%d lines parsed." % lines)

    self.saveFeaturesetFile(featureset, filename + ".featureset")
    return featureset
def getProbI(self, question):
    """Return the result of ``self.naive.prob_classify`` for ``question``.

    ``question`` is unpacked as a ``(title, body)`` pair. Both parts are
    tokenized with ``util.tokenizeIt``, the two token sequences are
    concatenated, and one feature set is built from the combined tokens
    with ``util.featureIt`` before it is handed to the classifier.
    """
    title, body = question
    title_tokens = util.tokenizeIt(title)
    body_tokens = util.tokenizeIt(body)
    combined_features = util.featureIt(title_tokens + body_tokens)
    return self.naive.prob_classify(combined_features)