Ejemplo n.º 1
0
 def __init__(self):
     '''
     Build the two helper objects and the scratch state kept on the instance.
     '''
     # Helper objects, constructed in this order: the DAO first, then the
     # dependency record.
     self.neoo4jDAO = Neo4jDAO()
     self.dep = Dependency()
     # An empty weight table and a zeroed counter.
     self.wt, self.temp = {}, 0
Ejemplo n.º 2
0
class PostFn:
    '''
    contains all the post processing functions

    Instance state:
      neoo4jDAO : DAO used through insert() and findDependent().
      dep       : Dependency record reused for every insert.
      wt        : scratch weight table rebuilt by the calulateWeight* walks.
      temp      : depth most recently recorded by getLevel().
    '''
    def __init__(self):
        self.neoo4jDAO = Neo4jDAO()
        self.dep = Dependency()
        self.wt = {}
        self.temp = 0

    def insertToDB(self, depGraphList):
        '''
        Extract each pair of words from the dependency list,
            calculate the chi-square value and insert to the database.
        :param depGraphList : list - representation of dependency graph.
                             eg : [[(word1,word2),(word3,word4)...],[(wordi),(wordi+1) ...],...]
        '''
        for dlist in depGraphList:
            for dtuple in dlist:
                parent, child = dtuple[0], dtuple[1]
                rel = self._chiSq(depGraphList, parent, child)
                # The same Dependency object is refilled and inserted each time.
                self.dep.setAll(parent, child, rel)
                self.neoo4jDAO.insert(self.dep)

    def _chiSq_temp(self, word1, word2):
        '''
        Score word1 against word2 using the Wu-Palmer similarity of their
        first WordNet synsets (despite the name, no chi-square is computed).
        :param word1 : string - first word
        :param word2 : string - second word
        :return value: result of wup_similarity, 0.0050 when the lookup or the
                       comparison fails, 0.2500 when the similarity is -1
        '''
        try:
            syn1 = wordnet.synsets(word1)[0]
            syn2 = wordnet.synsets(word2)[0]
            rel = syn1.wup_similarity(syn2)
        except Exception:
            # Best effort: e.g. a word without synsets raises IndexError.
            rel = 0.0050  # fallback threshold
        if rel == -1:
            rel = 0.2500
        return rel

    def fetchSenses(self, wsdWord):
        '''
        fetches all senses from wordnet corresponding to wsdWord
        :param wsdWord : string - word to look up
        :return value: list of definitions, one per synset;
                       an empty list if the word has no synsets
        '''
        senseList = []
        for syn in wordnet.synsets(wsdWord):
            definition = syn.definition
            # `definition` is a plain attribute in some NLTK releases and a
            # method in others; always store the text, never the method.
            if callable(definition):
                definition = definition()
            senseList.append(definition)
        return senseList

    def createSenseTree(self, senseList):
        '''
        create parse tree for all senses in senseList
        eg: {'conduct': ['institution', 'to', 'business'],
        'ROOT': ['created'], 'institution': ['an'], 'created': ['institution', 'conduct']}
        :param senseList : list - texts handed to parseSenses
        :return value: list with one defaultdict(list) per parsed entry,
                       mapping a stemmed word to the stemmed words paired with it
        '''
        senseTrees = []
        for dep in parseSenses(senseList):
            tree = defaultdict(list)
            for head, dependent in dep:
                tree[stemWords(head)].append(stemWords(dependent))
            senseTrees.append(tree)
        return senseTrees

    def createWSDTextTree(self, wsdText):
        '''
        create parse tree for the wsdText.
        wsdText should be a single sentence.
        Else only parse tree of first sentence will be returned
        '''
        # Wrap the text in a one-element list and unwrap the single result.
        return self.createSenseTree([wsdText])[0]

    def calulateWeightSense(self, parent, level, l):
        '''
        Walk the tree below `parent` and give every visited word the weight
        1.0 / depth, where the direct children of the starting node have
        depth level + 1. A word reached more than once keeps the weight of
        its last visit.
        :param parent : string - node whose children are weighted
        :param level  : int - depth of `parent`; 0 starts a fresh weight table
        :param l      : dict - tree mapping a word to the list of its children
        :return value: the weight table (self.wt), or None when called with
                       level == 1 on the 'ROOT' node
        '''
        if level == 1 and parent == 'ROOT':
            return None
        if level == 0:
            # Rebind instead of clearing: tables returned earlier stay intact.
            self.wt = {}
        level += 1
        # .get() keeps leaves from raising on plain dicts and from inserting
        # empty entries into defaultdict trees.
        for child in l.get(parent, ()):
            self.wt[child] = 1.0 / level
            self.calulateWeightSense(child, level, l)
        return self.wt

    def calulateWeightSense1(self, parent, level, l):
        '''
        Variant of calulateWeightSense whose start depth is tied to self.temp:
        the weight table is reset when level == self.temp - 1.
        :param parent : string - node whose children are weighted
        :param level  : int - depth of `parent`
        :param l      : dict - tree mapping a word to the list of its children
        :return value: the weight table (self.wt), or None when called with
                       level == self.temp on the 'ROOT' node
        '''
        if level == self.temp and parent == 'ROOT':
            return None
        if level == self.temp - 1:
            self.wt = {}
        level += 1
        for child in l.get(parent, ()):
            self.wt[child] = 1.0 / level
            # NOTE(review): the walk continues with calulateWeightSense, not
            # with this method; kept as-is - confirm this is intended.
            self.calulateWeightSense(child, level, l)
        return self.wt

    def depScore(self, senseList, wsdText, wsdWord, senseTrees, wsdTextTree):
        '''
        calculate DepScore and return the index number of the sense with largest DepScore
        Process: Search the KB for each word in each sense
        :param senseList   : list - sense texts, split on whitespace here
        :param wsdText     : string - text containing the word to disambiguate
        :param wsdWord     : string - word being disambiguated
        :param senseTrees  : list - one tree per entry of senseList
        :param wsdTextTree : dict - tree of wsdText
        :return value: index of the first sense with the highest score;
                       0 when there are no senses or when scoring fails
        '''
        score = []
        l = len(senseList)
        contextWords = set(wsdText.split())
        try:
            wtWSDText = self.calculateWeightWSDText(wsdWord, wsdTextTree)
            for i in range(l):
                tempScore = 0.0
                wtSense = self.calulateWeightSense('ROOT', 0, senseTrees[i])
                for word in senseList[i].split():
                    # search the KB for each word in each sense
                    deps = self.neoo4jDAO.findDependent(word)
                    if deps is None:
                        continue
                    for tup in deps:
                        node = str(tup[0])
                        if node not in contextWords:
                            continue
                        try:
                            wts = float(wtSense[word])
                            wtt = float(wtWSDText[node])
                            tempScore += float(tup[1]) * wts * wtt
                        except (LookupError, TypeError, ValueError):
                            # A word without a weight contributes nothing.
                            pass
                score.append(tempScore)
            if not score:
                return 0
            return score.index(max(score))
        except Exception:
            # Best effort: any failure falls back to the first sense.
            return 0

    def glossScore(self, senseList, wsdText, wsdWord, senseTrees, wsdTextTree):
        '''
        Score each sense by the words it shares with wsdText and return the
        index number of the sense with the largest score.
        :param senseList   : list - sense texts
        :param wsdText     : string - text containing the word to disambiguate
        :param wsdWord     : string - word being disambiguated
        :param senseTrees  : list - one tree per entry of senseList
        :param wsdTextTree : dict - tree of wsdText
        :return value: index of the first sense with the highest score;
                       0 when there are no senses or when scoring fails
        '''
        score = []
        l = len(senseList)
        contextWords = set(wsdText.split())
        try:
            wtWSDText = self.calculateWeightWSDText(wsdWord, wsdTextTree)
            # One lemmatizer for the whole run instead of one per word.
            lemmatizer = WordNetLemmatizer()
            for i in range(l):
                tempScore = 0.0
                wtSense = self.calulateWeightSense('ROOT', 0, senseTrees[i])
                sense = removeStopWords(senseList[i].lower().split())
                for word in sense:
                    word = lemmatizer.lemmatize(word, 'v')  # lemmatize as a verb
                    if word not in contextWords:
                        continue
                    try:
                        tempScore += float(wtSense[word]) + float(wtWSDText[word])
                    except (LookupError, TypeError, ValueError):
                        # A word without a weight contributes nothing.
                        pass
                score.append(tempScore)
            if not score:
                return 0
            return score.index(max(score))
        except Exception:
            # Best effort: any failure falls back to the first sense.
            return 0

    def getLevel(self, word, tree, parent, level):
        '''
        Walk the tree below `parent` and record in self.temp the depth at
        which `word` appears as a child (the last match visited wins).
        self.temp is not reset here, so it keeps its previous value when
        `word` is not found.
        :param word   : string - word to locate
        :param tree   : dict - tree mapping a word to the list of its children
        :param parent : string - node to start from
        :param level  : int - depth of `parent`
        :return value: self.temp, or None when called with level == 1 on the
                       'ROOT' node
        '''
        if level == 1 and parent == 'ROOT':
            return None
        level += 1
        for child in tree.get(parent, ()):
            if child == word:
                self.temp = level
            self.getLevel(word, tree, child, level)
        return self.temp

    def calculateWeightWSDText(self, word, tree):
        '''
        Build the weight table of the text tree relative to `word`: every
        node reachable from 'ROOT' gets a weight from calulateWeightSense1,
        then the nodes below `word` are overridden with the weights from
        calulateWeightSense(word, 0, tree).
        :param word : string - word being disambiguated
        :param tree : dict - tree mapping a word to the list of its children
        :return value: dict word -> weight; when `word` is not found below
                       'ROOT' only the weights of the nodes below `word`
                       are returned
        '''
        self.wt = {}
        below = dict(self.calulateWeightSense(word, 0, tree))
        # getLevel only writes self.temp on a match; clear it first so a
        # depth left over from an earlier word cannot leak into this one.
        self.temp = 0
        level = self.getLevel(word, tree, 'ROOT', 0)
        if level != 0:
            level -= 1
        wtt = self.calulateWeightSense1("ROOT", level, tree)
        if wtt is None:
            # No depth was found for `word`, so no ROOT-based table exists.
            return below
        wtt.update(below)
        return wtt

    def _countPairOccurrences(self, pairs, word1, word2):
        '''
        Count in one pass the pairs that contain both words, word1, and word2.
        :param pairs : list - (word, word) tuples
        :return value: tuple (both, withFirst, withSecond)
        '''
        both = withFirst = withSecond = 0
        for pair in pairs:
            hasFirst = word1 == pair[0] or word1 == pair[1]
            hasSecond = word2 == pair[0] or word2 == pair[1]
            if hasFirst:
                withFirst += 1
            if hasSecond:
                withSecond += 1
            if hasFirst and hasSecond:
                both += 1
        return both, withFirst, withSecond

    def _chiSq(self, depGraphList, word1, word2):
        '''
        calculate the chi-square value of word1 and word2 from their
        occurrence counts in the first list of depGraphList.
        :param depGraphList : list - representation of dependency graph
        :param word1 : string - first word
        :param word2 : string - second word
        :return value: absolute chi-square value as a float
        '''
        bigram_measures = BigramAssocMeasures()
        # NOTE(review): only the first list is counted, even for pairs that
        # come from later lists - confirm this is intended.
        pairs = depGraphList[0]
        depLength = len(pairs)
        both, withFirst, withSecond = self._countPairOccurrences(pairs, word1, word2)
        # value of n11
        cnt1 = both
        # value of n12 and n21: occurrences of each word, minus one
        cnt2 = withFirst - 1
        cnt3 = withSecond - 1
        # total of n11 & n12
        n1p = cnt1 + cnt2
        # total of n11 & n21
        np1 = cnt1 + cnt3
        # Equation of chi square test=> X^2 = [N(n11 * n22 - n12 * n21)^2]/[n1. * n2. * n.1 * n.2]
        return abs(float(bigram_measures.chi_sq(cnt1, (np1, n1p), depLength)))

    def createMarkup(self, sense, wsdText, wsdWord):
        '''
        Build the HTML heading that shows wsdWord next to a text.
        :param sense   : string - text shown when trim(wsdText) returns 0
        :param wsdText : string - text handed to trim()
        :param wsdWord : string - word shown before the separator
        :return value: html string
        '''
        temp = trim(wsdText)
        if temp != 0:
            sense = temp
        # NOTE(review): the values are concatenated without HTML escaping;
        # confirm they cannot carry untrusted markup.
        html = '<h4 style="color:white">' + wsdWord + ' : ' + sense + ' </h4>'
        return html