Beispiel #1
0
    def getSplitData(self,node):
        """Break a question node into its attributes, body text and comment nodes.

        Returns a 7-tuple ``(qid, qcategory, quserid, qtype, qgold_yn, qBody,
        commentNodeList)``. The first five entries are the values of the
        ``QID``, ``QCATEGORY``, ``QUSERID``, ``QTYPE`` and ``QGOLD_YN``
        attributes of *node*; ``qBody`` is the value of the first ``QBody``
        child and ``commentNodeList`` is the ``Comment`` lookup result.
        """
        # Attribute names in the order callers unpack them.
        attrNames = ('QID', 'QCATEGORY', 'QUSERID', 'QTYPE', 'QGOLD_YN')
        attrValues = tuple(DataExtrator.getAttrValue(node, name) for name in attrNames)

        bodyNode = DataExtrator.getXMLNode(node, 'QBody')[0]
        bodyText = DataExtrator.getNodeValue(bodyNode)
        comments = DataExtrator.getXMLNode(node, 'Comment')
        return attrValues + (bodyText, comments)
Beispiel #2
0
    def getPredictsData(self,traiFileName,testFileName,generalFile,yesnoFile):
        """Classify the comments of every test question and append the scores to file.

        Both XML files are parsed with ``minidom``. For each ``Question`` in
        the test file, every train question with the same category and the
        same type is used to train a Naive Bayes classifier (when it yields
        any training data). The per-label probabilities of each test comment
        are printed and, unless filtered out, accumulated through
        ``self.addDict``. The accumulated dict is finally handed to
        ``self.appendFile``.

        Args:
            traiFileName: path of the training XML file.
            testFileName: path of the test XML file.
            generalFile: target passed to ``self.appendFile`` for questions
                whose type is not ``'YES_NO'``.
            yesnoFile: target passed to ``self.appendFile`` for ``'YES_NO'``
                questions.
        """
        trainRoot = minidom.parse(traiFileName).documentElement
        testRoot = minidom.parse(testFileName).documentElement

        testQuestions = DataExtrator.getXMLNode(testRoot, 'Question')
        trainQuestions = DataExtrator.getXMLNode(trainRoot, 'Question')

        # The fields of a train question do not depend on the test question,
        # so extract them once instead of once per (test, train) pair.
        trainSplits = [self.getSplitData(node) for node in trainQuestions]

        for questionNo, testQuestion in enumerate(testQuestions, 1):
            allLevelDict = {}
            testQid, testQcategory, _, testQtype, _, testQBody, testCommentNodeList = self.getSplitData(testQuestion)

            for trainQid, trainQcategory, _, trainQtype, _, trainQBody, trainCommentNodeList in trainSplits:
                # Only train questions of the same category and type are used.
                if testQcategory != trainQcategory or trainQtype != testQtype:
                    continue

                trainData, topNWords = self.generateFeatures(trainQtype, trainQBody, trainCommentNodeList)
                testData, cidList = self.getTestData(testQtype, testQBody, testCommentNodeList, topNWords)
                if not trainData:
                    print('It does not have dataset..')
                    continue

                classifier = nltk.NaiveBayesClassifier.train(trainData)
                labelList = sorted(classifier.labels())
                pdist = classifier.prob_classify_many(testData)
                print('[TEST]:%s - %s \r\n[TRAIN]:%s - %s' %(testQid,testQBody,trainQid,trainQBody))

                # cidList and testData are built in lockstep by getTestData,
                # so the i-th distribution belongs to the i-th comment id.
                for commentNo, (cid, dist) in enumerate(zip(cidList, pdist), 1):
                    for label in labelList:
                        prob = dist.prob(label)
                        # Probabilities of exactly 0, 1 or 0.5 are printed but
                        # not recorded in the result dict.
                        if prob not in (0, 1, 0.5):
                            self.addDict(allLevelDict, cid, questionMarks[label], prob)
                        print('第%s个comment属于%s的概率: %.4f' %(commentNo,questionMarks[label],prob))

            if testQtype == 'YES_NO':
                self.appendFile(allLevelDict, yesnoFile)
            else:
                self.appendFile(allLevelDict, generalFile)

            # str() instead of bytes(): identical on Python 2, and bytes(int)
            # would build a NUL-filled bytes object on Python 3.
            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Question number'+str(questionNo)+' finished')
            
Beispiel #3
0
 def getTestData(self,testQtype,testQBody,testCommentNodeList,trainTopNWords):
     """Build one feature set per test comment, plus the matching comment ids.

     Args:
         testQtype: question type; not used by this method.
         testQBody: question body text, prepended to every comment body.
         testCommentNodeList: iterable of comment nodes of the test question.
         trainTopNWords: top words of the training side, passed through to
             ``self.getFeatures``.

     Returns:
         ``(testData, cidList)``: two lists of equal length, where
         ``testData[i]`` is the feature entry for the comment whose ``CID``
         attribute is ``cidList[i]``.
     """
     testData = []
     cidList = []
     for commentNode in testCommentNodeList:
         cid = DataExtrator.getAttrValue(commentNode, 'CID')
         cBody = DataExtrator.getNodeValue(DataExtrator.getXMLNode(commentNode, 'CBody')[0])
         # NOTE(review): question and comment text are joined without a
         # separator - confirm selectTopNWords tolerates a fused boundary word.
         testTopNWords, _ = self.selectTopNWords(testQBody + cBody)
         # The double nesting only reshapes the input so the existing
         # getFeatures method can be reused; the single result is unwrapped below.
         allFeatures = self.getFeatures(trainTopNWords, [[testTopNWords]])
         testData.append(allFeatures[0][0])
         cidList.append(cid)
     return testData, cidList
Beispiel #4
0
    def generateFeatures(self,qtype,qBody,commentNodeList):
        """Group a question's comments by gold label and turn them into features.

        Comment bodies are sorted into ten buckets. Bucket ``i`` (0-8) holds
        the comments whose gold label equals ``questionMarks[i]``; the last
        bucket holds comments that matched none of the labels valid for the
        question type. For ``general`` questions the ``CGOLD`` attribute is
        compared against labels 0-5, for ``yesno`` questions the ``CGOLD_YN``
        attribute is compared against labels 6-8. For any other ``qtype``
        all buckets stay empty.

        Args:
            qtype: question type, compared against the module-level
                ``general`` and ``yesno`` values.
            qBody: question body text.
            commentNodeList: iterable of comment nodes.

        Returns:
            ``(allFeatures, topNWords)``: the result of ``self.getFeatures``
            and the top words computed by ``self.selectTopNWords``.
        """
        noDisplayIdx = 9
        buckets = [[] for _ in range(noDisplayIdx + 1)]

        # The gold attribute and the range of valid label indices depend only
        # on the question type, so decide once rather than per comment.
        if qtype == general:
            goldAttr, labelIdxs = 'CGOLD', range(0, 6)
        elif qtype == yesno:
            # Yes/no questions do not take the comment type (CGOLD) into account yet.
            goldAttr, labelIdxs = 'CGOLD_YN', range(6, 9)
        else:
            goldAttr, labelIdxs = None, ()

        if goldAttr is not None:
            for node in commentNodeList:
                gold = DataExtrator.getAttrValue(node, goldAttr)
                cBody = DataExtrator.getNodeValue(DataExtrator.getXMLNode(node, 'CBody')[0])
                # First matching label wins. Anything else - including the
                # N/A label questionMarks[9], which deliberately has no bucket
                # of its own - falls into the no-display bucket.
                target = next((idx for idx in labelIdxs if gold == questionMarks[idx]),
                              noDisplayIdx)
                buckets[target].append(cBody)

        # Order matters: question body first, then the buckets in label order.
        allList = [[qBody]] + buckets

        topNWords, splitWords = self.selectTopNWords(allList)
        # Discard the first entry (presumably the one derived from the
        # question body - confirm against selectTopNWords).
        splitWords.pop(0)
        allFeatures = self.getFeatures(topNWords, splitWords)
        return allFeatures, topNWords