def getSplitData(self, node):
    """Pull the fields of one question node apart.

    Reads the QID, QCATEGORY, QUSERID, QTYPE and QGOLD_YN attributes of
    ``node`` through ``DataExtrator.getAttrValue``, the value of the first
    ``QBody`` child through ``DataExtrator.getNodeValue``, and the ``Comment``
    children through ``DataExtrator.getXMLNode``.

    Returns a 7-tuple:
    ``(qid, qcategory, quserid, qtype, qgold_yn, qBody, commentNodeList)``.
    """
    attribute_names = ('QID', 'QCATEGORY', 'QUSERID', 'QTYPE', 'QGOLD_YN')
    attribute_values = tuple(
        DataExtrator.getAttrValue(node, attribute_name)
        for attribute_name in attribute_names
    )

    # Only the first QBody child is used; indexing fails if there is none.
    body_node = DataExtrator.getXMLNode(node, 'QBody')[0]
    body_text = DataExtrator.getNodeValue(body_node)
    comment_nodes = DataExtrator.getXMLNode(node, 'Comment')

    return attribute_values + (body_text, comment_nodes)
def getPredictsData(self, traiFileName, testFileName, generalFile, yesnoFile):
    """Classify the comments of every test question and append the results.

    Both XML files are parsed with ``minidom`` and their ``Question`` nodes
    are collected. Each test question is compared with every training
    question; when category and type are both equal, a
    ``nltk.NaiveBayesClassifier`` is trained on the features of that single
    training question (``self.generateFeatures``) and used to compute label
    probabilities for the test question's comments (``self.getTestData``).
    Probabilities other than 0, 1 and 0.5 are recorded with ``self.addDict``
    under the comment id and ``questionMarks[label]``.

    After all training questions have been tried, the collected dictionary
    is passed to ``self.appendFile`` together with ``yesnoFile`` when the
    test question's type is ``'YES_NO'`` and with ``generalFile`` otherwise.

    Parameters:
        traiFileName: path (or file object) of the training XML.
        testFileName: path (or file object) of the test XML.
        generalFile: target handed to ``self.appendFile`` for non-YES_NO
            questions.
        yesnoFile: target handed to ``self.appendFile`` for YES_NO questions.

    Progress is written to standard output. Single-argument ``print(...)``
    calls and ``str()`` are used so the output is the same on Python 2 and
    the method also runs on Python 3.

    NOTE(review): the original source had lost its indentation. The
    "It does not have dataset.." branch is assumed to belong to the
    ``if trainData`` check, and the per-comment probability print is assumed
    to sit inside the probability filter - confirm against history.
    """
    trainRoot = minidom.parse(traiFileName).documentElement
    testRoot = minidom.parse(testFileName).documentElement
    testQuestions = DataExtrator.getXMLNode(testRoot, 'Question')
    trainQuestions = DataExtrator.getXMLNode(trainRoot, 'Question')

    # The training questions are the same for every test question, so split
    # them once instead of once per (test, train) pair.
    trainSplits = [self.getSplitData(node) for node in trainQuestions]

    for i, testQuestion in enumerate(testQuestions):
        allLevelDict = {}
        (testQid, testQcategory, _, testQtype, _,
         testQBody, testCommentNodeList) = self.getSplitData(testQuestion)

        for (trainQid, trainQcategory, _, trainQtype, _,
             trainQBody, trainCommentNodeList) in trainSplits:
            # Only training questions of the same category and type are used.
            if testQcategory != trainQcategory or trainQtype != testQtype:
                continue

            trainData, topNWords = self.generateFeatures(
                trainQtype, trainQBody, trainCommentNodeList)
            testData, cidList = self.getTestData(
                testQtype, testQBody, testCommentNodeList, topNWords)

            if not trainData:
                print('It does not have dataset..')
                continue

            classifier = nltk.NaiveBayesClassifier.train(trainData)
            labelList = sorted(classifier.labels())
            pdist = classifier.prob_classify_many(testData)
            print('[TEST]:%s - %s \r\n[TRAIN]:%s - %s'
                  % (testQid, testQBody, trainQid, trainQBody))

            for j, dist in enumerate(pdist):
                for label in labelList:
                    prob = dist.prob(label)
                    # Drop the values 0 and 1 (and 0.5), as the original did.
                    if prob in (0, 1, 0.5):
                        continue
                    self.addDict(allLevelDict, cidList[j],
                                 questionMarks[label], prob)
                    print('第%s个comment属于%s的概率: %.4f'
                          % (j + 1, questionMarks[label], prob))

        if testQtype == 'YES_NO':
            self.appendFile(allLevelDict, yesnoFile)
        else:
            self.appendFile(allLevelDict, generalFile)
        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Question number'
              + str(i + 1) + ' finished')
def getTestData(self, testQtype, testQBody, testCommentNodeList, trainTopNWords):
    """Build one classifier input per test comment.

    For every node in ``testCommentNodeList`` the value of its first
    ``CBody`` child is appended to ``testQBody``, the combined text is passed
    to ``self.selectTopNWords``, and the resulting words are turned into
    features with ``self.getFeatures`` against ``trainTopNWords``.

    Parameters:
        testQtype: question type; accepted for interface compatibility and
            not used here.
        testQBody: body text of the test question.
        testCommentNodeList: iterable of comment nodes.
        trainTopNWords: top-word list produced from the training question.

    Returns:
        ``(testData, cidList)`` - two parallel lists holding, per comment,
        the extracted feature entry and the comment's ``CID`` attribute.
        Both are empty when there are no comments.
    """
    testData = []
    cidList = []
    for commentNode in testCommentNodeList:
        cid = DataExtrator.getAttrValue(commentNode, 'CID')
        cBody = DataExtrator.getNodeValue(
            DataExtrator.getXMLNode(commentNode, 'CBody')[0])

        # Only the top words are needed; the second return value is unused.
        testTopNWords, _ = self.selectTopNWords(testQBody + cBody)

        # The double wrapping only reshapes the input so the existing
        # getFeatures method can be reused for a single comment.
        # NOTE(review): presumably [0][0] selects the feature part of the
        # first (features, label) entry - confirm against getFeatures.
        allFeatures = self.getFeatures(trainTopNWords, [[testTopNWords]])
        testData.append(allFeatures[0][0])
        cidList.append(cid)
    return testData, cidList
def generateFeatures(self, qtype, qBody, commentNodeList):
    """Group a question's comments by gold label and build training features.

    Comment bodies are sorted into ten buckets. Bucket ``i`` (0-8) receives
    the comments whose gold label equals ``questionMarks[i]``; bucket 9
    receives comments whose label matched none of the marks checked for the
    question type:

    * ``qtype == general``: the ``CGOLD`` attribute is compared with
      ``questionMarks[0]`` .. ``questionMarks[5]``.
    * ``qtype == yesno``: the ``CGOLD_YN`` attribute is compared with
      ``questionMarks[6]`` .. ``questionMarks[8]``.
    * any other ``qtype``: no comment is bucketed.

    The list ``[[qBody]]`` followed by the ten buckets is passed to
    ``self.selectTopNWords``; the first entry of the returned split words is
    removed and the rest goes to ``self.getFeatures``.

    Returns:
        ``(allFeatures, topNWords)`` as produced by ``self.getFeatures`` and
        ``self.selectTopNWords``.
    """
    noDisplayIndex = 9
    buckets = [[] for _ in range(noDisplayIndex + 1)]

    # The question type is fixed for the whole call, so decide once which
    # attribute carries the gold label and which marks it is compared with.
    if qtype == general:
        goldAttr, markIndexes = 'CGOLD', range(0, 6)
    elif qtype == yesno:
        # The comment type is not yet taken into account for yes/no questions.
        goldAttr, markIndexes = 'CGOLD_YN', range(6, 9)
    else:
        goldAttr, markIndexes = None, ()

    if goldAttr is not None:
        for node in commentNodeList:
            gold = DataExtrator.getAttrValue(node, goldAttr)
            cBody = DataExtrator.getNodeValue(
                DataExtrator.getXMLNode(node, 'CBody')[0])
            # First matching mark wins, exactly like the former elif ladder.
            for index in markIndexes:
                if gold == questionMarks[index]:
                    buckets[index].append(cBody)
                    break
            else:
                buckets[noDisplayIndex].append(cBody)

    # N/A answers get no bucket of their own; the original note says a
    # similarity computation is unnecessary for them.
    allList = [[qBody]] + buckets
    topNWords, splitWords = self.selectTopNWords(allList)
    # Discard the first entry so only the comment buckets are labelled.
    # NOTE(review): presumably splitWords mirrors allList, making entry 0
    # the question body - confirm against selectTopNWords.
    splitWords.pop(0)
    allFeatures = self.getFeatures(topNWords, splitWords)
    return allFeatures, topNWords